diff --git a/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt b/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt new file mode 100644 index 0000000000000000000000000000000000000000..240d599e2ca6c69f48c40f9bfe25394b25b7f367 --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fed29f7cdafd92543a361433dc4cb9945ec9b57b8eefd1bd90261575ee64f27 +size 1646767740 diff --git a/c4_original-d=1024_l=24_h=8-0.25/params.txt b/c4_original-d=1024_l=24_h=8-0.25/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..04ab5b74c9282d1d5997b120acbcb9a11c752da8 --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-0.25/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-0.25/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-0.25/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=1024_l=24_h=8-0.25 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt b/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..641ed3bc74d72e67735fdca94bd2c6112fc3539e --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f4fb6936004ce754dc68494dd4cf1676204b974525767026685f71cbf6b1bba +size 1646767740 diff --git a/c4_original-d=1024_l=24_h=8-0.5/params.txt b/c4_original-d=1024_l=24_h=8-0.5/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1dcbdee0794fe6300a81f1fe41e52740c4a3b14 --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-0.5/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-0.5/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-0.5/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=1024_l=24_h=8-0.5 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt b/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..9789e957e7b68117d53b7cc1e4e8849f187f7610 --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e22cd01e23e35977c64e6d10b0b1150f143eecd4b677c314899ff636427504b +size 1646767740 diff --git a/c4_original-d=1024_l=24_h=8-1.0/params.txt b/c4_original-d=1024_l=24_h=8-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..7baaaaf393da133d4c3832f4ca034ed21b689b4a --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=1024_l=24_h=8-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt b/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..818a7ae8bfd4823f955fb08d43f6e7d7463fbbba --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0384d69859b449de6fdc8484653b43988d4775673f39f58788190f640ecdf616 +size 1646767036 diff --git a/c4_original-d=1024_l=24_h=8-16.0/params.txt b/c4_original-d=1024_l=24_h=8-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0edde8f3257f033d0f86debc0059e9941b31516 --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 2 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: logs/26439/c4_original-d=1024_l=24_h=8-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 32 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: logs/26439/c4_original-d=1024_l=24_h=8-16.0/out.log +logs: logs/26439 +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=1024_l=24_h=8-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 8 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 2 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt b/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2a2dff84c7d730ba989547dc2a6cab00d8e45bd --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c27cfcf448d1f1db45b6fb4a5902f57879f36603394399768c141262ebd2a56 +size 1646767740 diff --git a/c4_original-d=1024_l=24_h=8-2.0/params.txt b/c4_original-d=1024_l=24_h=8-2.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..20e610409dabf741454a405acd467786418a303d --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-2.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-2.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-2.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=1024_l=24_h=8-2.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt b/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1337747559acb556ac9b91385b0aff0502700a2 --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:836fd49f4007798516f011caa995547fcbe637c2c28b28b6c518bf3bdc920369 +size 1646766972 diff --git a/c4_original-d=1024_l=24_h=8-4.0/params.txt b/c4_original-d=1024_l=24_h=8-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6de742b2066ff7f7a68ffea7264174ca18278dc --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 2 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: logs/25614/c4_original-d=1024_l=24_h=8-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 64 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: logs/25614/c4_original-d=1024_l=24_h=8-4.0/out.log +logs: logs/25614 +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=1024_l=24_h=8-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 8 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt b/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6ca92e143876a084c153981c6c38f283dc0760c --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6380471557fc25d8fd64fd3f833a4285f13244c3b7ee89f3820bf6236a5d6b2 +size 1646767740 diff --git a/c4_original-d=1024_l=24_h=8-8.0/params.txt b/c4_original-d=1024_l=24_h=8-8.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c703632008f2d45622e545d0a21455a400e40f9 --- /dev/null +++ b/c4_original-d=1024_l=24_h=8-8.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-8.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-8.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=1024_l=24_h=8-8.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt b/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c295a7e48d65c0cf1c09ee1d83db426d5a6e897 --- /dev/null +++ b/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:294960f1a93aee07fa9972ac71eb2b10254b4f2fa20c32baf74342668fdd2274 +size 315725493 diff --git a/c4_original-d=512_l=8_h=4-0.25/params.txt b/c4_original-d=512_l=8_h=4-0.25/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..e129538bcdbfc865f466cb85f209c24e29381bc3 --- /dev/null +++ b/c4_original-d=512_l=8_h=4-0.25/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-0.25/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-0.25/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=512_l=8_h=4-0.25 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt b/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bcc377b66d00f2a6c67eebdb44701d45cca0425 --- /dev/null +++ b/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7566deea2ebb5dfeb2d568f80f9a8829fc500aa14e8a30ffc21f01aa57bd734 +size 315725493 diff --git a/c4_original-d=512_l=8_h=4-0.5/params.txt b/c4_original-d=512_l=8_h=4-0.5/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..fca173ed7ede36c4582823129def7e0fda7cd97f --- /dev/null +++ b/c4_original-d=512_l=8_h=4-0.5/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-0.5/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-0.5/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=512_l=8_h=4-0.5 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=512_l=8_h=4-1.0/checkpoints/epoch_6.pt b/c4_original-d=512_l=8_h=4-1.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..5dd11107de63b2764aac0e1d4696ee4e13371abe --- /dev/null +++ b/c4_original-d=512_l=8_h=4-1.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4f740c0d4f4bc9e11d6ca96facab4970997ca2c12f307c59c410f98ff0f66a3 +size 315725493 diff --git a/c4_original-d=512_l=8_h=4-1.0/params.txt b/c4_original-d=512_l=8_h=4-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..37929dd18b6c38f3d8863b6be716b687a0e6a44e --- /dev/null +++ b/c4_original-d=512_l=8_h=4-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=512_l=8_h=4-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-1.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt b/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d01cb8ee561f129458cef937b1e71def622c823 --- /dev/null +++ b/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d24828a5e1cca5e3a65fce61fe9ecd72ea7d8e09c6658ca07768a5a3fc6978a5 +size 315725557 diff --git a/c4_original-d=512_l=8_h=4-16.0/params.txt b/c4_original-d=512_l=8_h=4-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..7df580ffff068e1e4d9ace1f5bcda29b6de6420d --- /dev/null +++ b/c4_original-d=512_l=8_h=4-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-16.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=512_l=8_h=4-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt b/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b969667f926eba193e1067f5ecbd1cf2d3a8899 --- /dev/null +++ b/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16a65618e30f4d1ae030457c00fe3159becb284c2c7d90fa518eebc4d3dfeb94 +size 315725557 diff --git a/c4_original-d=512_l=8_h=4-2.0/params.txt b/c4_original-d=512_l=8_h=4-2.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..f47d7718dbd6e4c919955b7767a4406e1723b73a --- /dev/null +++ b/c4_original-d=512_l=8_h=4-2.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-2.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-2.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=512_l=8_h=4-2.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt b/c4_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..94f27f15a91e9d928394b1d4bf01e90897974069 --- /dev/null +++ b/c4_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10d08659e80b72a44718bfe98c2ecde3964c62c3177ac5d4e15a09738520b602 +size 315725557 diff --git a/c4_original-d=512_l=8_h=4-32.0/params.txt b/c4_original-d=512_l=8_h=4-32.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..95e1ef32b7345ca5bea28beb3a94f597b0e2eb08 --- /dev/null +++ b/c4_original-d=512_l=8_h=4-32.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-32.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-32.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=512_l=8_h=4-32.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt b/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..535b1173298398836b3ac969e77f74c9e1e5f081 --- /dev/null +++ b/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2eb1fdc413c0b694f10375240e796fb6b27ed9207eff96d21967ea5abd2d776 +size 315725557 diff --git a/c4_original-d=512_l=8_h=4-4.0/params.txt b/c4_original-d=512_l=8_h=4-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b7debfdd6f2a2ec12b56b5256d35e8da932cf6a --- /dev/null +++ b/c4_original-d=512_l=8_h=4-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-4.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=512_l=8_h=4-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt b/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..79c42633176dfbcc88af2a81b0b60f04c2bcfbce --- /dev/null +++ b/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:807c93b06719d7f853ff3a645115e177e0e70bd12196371750d03e38c4fc7683 +size 315725557 diff --git a/c4_original-d=512_l=8_h=4-8.0/params.txt b/c4_original-d=512_l=8_h=4-8.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..483565d2b5fa74fa176ed9f80a11516ee5d1851c --- /dev/null +++ b/c4_original-d=512_l=8_h=4-8.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-8.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-8.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=512_l=8_h=4-8.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt b/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt new file mode 100644 index 0000000000000000000000000000000000000000..341ac785a3fcd0d441aa533d1fb15f8a70ad9290 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6efb2482178dde290567c7514d882f8b00bf67df82690a6984c912aca5d065 +size 614923196 diff --git a/c4_original-d=576_l=24_h=8-0.25/params.txt b/c4_original-d=576_l=24_h=8-0.25/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..904c2ba7fdf674b1c9a729a18047fb677ffa82b0 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-0.25/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-0.25/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-0.25/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=576_l=24_h=8-0.25 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt b/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..74c5d989a8281ca3b341476d78357119a672b590 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78a83ba24d2518ce88b3349fddd4a687ea23e1a65b22bba5556643bc5e70d704 +size 614923196 diff --git a/c4_original-d=576_l=24_h=8-0.5/params.txt b/c4_original-d=576_l=24_h=8-0.5/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e2a02255c5804188c08bbe5856268be895d8fb2 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-0.5/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-0.5/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-0.5/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=576_l=24_h=8-0.5 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt b/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..75899e503004fe1d56f505f7ce06003922a3821d --- /dev/null +++ b/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efa37e9abd5ba9f09b8d28810350e004ade0efbb426e3f6fa364b6d630300192 +size 614923196 diff --git a/c4_original-d=576_l=24_h=8-1.0/params.txt b/c4_original-d=576_l=24_h=8-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6c7cd2e43ed4db25889fa983c8e05d955403499 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=576_l=24_h=8-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt b/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..9de7782c5faeb2906cc0feff1cdaa19138073ba1 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:796d153dd609535364be9901f88d14361ef01a58c38d1b0cfbffa81f7d1e359a +size 614922428 diff --git a/c4_original-d=576_l=24_h=8-16.0/params.txt b/c4_original-d=576_l=24_h=8-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5a93fcccd01abd050d206090c9ae08282b6e3af --- /dev/null +++ b/c4_original-d=576_l=24_h=8-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: logs/186/c4_original-d=576_l=24_h=8-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 32 +global_val_batch_size: 4 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: logs/186/c4_original-d=576_l=24_h=8-16.0/out.log +logs: logs/186 +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=576_l=24_h=8-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 2 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt b/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4ed07867237d399a74a869aa2d3526b1023b625 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16c94bfeadfe8bff32399f264e582786f207d34e36a1f12598a6cf55abe0f10b +size 614923196 diff --git a/c4_original-d=576_l=24_h=8-2.0/params.txt b/c4_original-d=576_l=24_h=8-2.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b159fa5eaceadd8c98a160967a7adf26a407157 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-2.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-2.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-2.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=576_l=24_h=8-2.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt b/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b0fa94ab9f2efd825455bca98cc3223192ce872 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a495a3c15e7f598d2d404622514f8f6a0cd540f85b0d42593b086fa135314c57 +size 614923196 diff --git a/c4_original-d=576_l=24_h=8-32.0/params.txt b/c4_original-d=576_l=24_h=8-32.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..58643d4f52b997aeb567e6143440be8adbe1e06e --- /dev/null +++ b/c4_original-d=576_l=24_h=8-32.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-32.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-32.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=576_l=24_h=8-32.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt b/c4_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..606fe632cb6afdf7dde7155d369868f1cb097d66 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f09b35d1b0c8c188186f63419cb560d473f485cd79940587b562e7a82657c0f1 +size 614923196 diff --git a/c4_original-d=576_l=24_h=8-4.0/params.txt b/c4_original-d=576_l=24_h=8-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..71271ccf9fe30ea9c22baa6537f55cc87a8c3e42 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-4.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=576_l=24_h=8-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt b/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..594919a234cb6e08dae313e04b06b744cdb43519 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a674ef037b53e720f575e92c3b64d20ec04913276559633f03c1953f39854f3 +size 614922428 diff --git a/c4_original-d=576_l=24_h=8-8.0/params.txt b/c4_original-d=576_l=24_h=8-8.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebc8531837ed77c98a0a19d97c83ed8a11158ec1 --- /dev/null +++ b/c4_original-d=576_l=24_h=8-8.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 2 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: logs/787/c4_original-d=576_l=24_h=8-8.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 64 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: logs/787/c4_original-d=576_l=24_h=8-8.0/out.log +logs: logs/787 +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=576_l=24_h=8-8.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 8 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt b/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b4f65938a25ed587f4384debaee03966b7aa7b2 --- /dev/null +++ b/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3b8a4e13665af64f2b60e78412b823b4f1cf712f85f4736ea95aa1c56ec5057 +size 42317749 diff --git a/c4_original-d=96_l=8_h=4-0.25/params.txt b/c4_original-d=96_l=8_h=4-0.25/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..90094767f6c2da54a0bdf838e0291acc1dc04236 --- /dev/null +++ b/c4_original-d=96_l=8_h=4-0.25/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-0.25/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-0.25/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=96_l=8_h=4-0.25 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt b/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..a761ab81a13991ac00bb0f3ab15239077228e6ef --- /dev/null +++ b/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be7b97a11887e036413637725f12a57a34f74d8220ab35d860dc9a86ccf71153 +size 42317749 diff --git a/c4_original-d=96_l=8_h=4-0.5/params.txt b/c4_original-d=96_l=8_h=4-0.5/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..da67afa17a77ffd4b1dd7fa300e6df9e4f501727 --- /dev/null +++ b/c4_original-d=96_l=8_h=4-0.5/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-0.5/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-0.5/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=96_l=8_h=4-0.5 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt b/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab55cc6100240dec377708cf6ddc414463717949 --- /dev/null +++ b/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9845bcd43d1e6e30ed4f7bfca54e33e4344c82467a58a30a773107aa9eefe23c +size 42317749 diff --git a/c4_original-d=96_l=8_h=4-1.0/params.txt b/c4_original-d=96_l=8_h=4-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..7701c1a278f158c16a0b8949375e402ecad71f09 --- /dev/null +++ b/c4_original-d=96_l=8_h=4-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=96_l=8_h=4-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt b/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt new file mode 100644 index 0000000000000000000000000000000000000000..c14cc80b31f0b61a81532db461d6cc91e0e6e519 --- /dev/null +++ b/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38a9eca527674b483c6ed9b35479ac5473ee8b455ab0683b3a91b7ae8bff522c +size 42317749 diff --git a/c4_original-d=96_l=8_h=4-16.0/params.txt b/c4_original-d=96_l=8_h=4-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..15d40fa6b7d6b76dcbef6f88bce84c6a6c75eb81 --- /dev/null +++ b/c4_original-d=96_l=8_h=4-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-16.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=96_l=8_h=4-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt b/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt new file mode 100644 index 0000000000000000000000000000000000000000..2828321bcb59b787f2403e6b9da8161f9391226d --- /dev/null +++ b/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:492e5a95cec4577cb3bb16a9cfdb7d91481213d378aa04927623d397420f2070 +size 42317749 diff --git a/c4_original-d=96_l=8_h=4-2.0/params.txt b/c4_original-d=96_l=8_h=4-2.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..b694aeba8743531485cafc8bdd4cd77adfec4305 --- /dev/null +++ b/c4_original-d=96_l=8_h=4-2.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-2.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-2.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=96_l=8_h=4-2.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt b/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..35c7e9ee70ed2c4ef7b8623fcee159ea7a7545c9 --- /dev/null +++ b/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7aded689d75bd052f647f6a330289c5ec2ed55f42d234a5f66fa8b2df2d225f +size 42317749 diff --git a/c4_original-d=96_l=8_h=4-32.0/params.txt b/c4_original-d=96_l=8_h=4-32.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..a72539584dd07283bb2af72fd87d5f7e4594ca3c --- /dev/null +++ b/c4_original-d=96_l=8_h=4-32.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-32.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-32.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=96_l=8_h=4-32.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt b/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0cb8d8e9dd2c6723cd2802291c3634cbab9a4fa --- /dev/null +++ b/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c0dd8bf9f1be543946167d5244bd58fff07734301e80e5d2eb54ac06bff4bf7 +size 42317749 diff --git a/c4_original-d=96_l=8_h=4-4.0/params.txt b/c4_original-d=96_l=8_h=4-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..174d53a542ddbefc0f73ddc6db18f336f5ae5d5b --- /dev/null +++ b/c4_original-d=96_l=8_h=4-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-4.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=96_l=8_h=4-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt b/c4_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt new file mode 100644 index 0000000000000000000000000000000000000000..9663dacb49f938eeb93d3594d6b48247fa2288f7 --- /dev/null +++ b/c4_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71a6861bb2519ad0ad68be819026a34eb049935122b8d75eccd1144b995342db +size 42317749 diff --git a/c4_original-d=96_l=8_h=4-8.0/params.txt b/c4_original-d=96_l=8_h=4-8.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..7eb1dcb66a3ec3f091336f9cd42001852c0afb8f --- /dev/null +++ b/c4_original-d=96_l=8_h=4-8.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-8.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-8.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-d=96_l=8_h=4-8.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-open_lm_1b-1.0/checkpoints/epoch_6.pt b/c4_original-open_lm_1b-1.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4eb96048e1ec7394e5d40e4269272e5fe30e41c --- /dev/null +++ b/c4_original-open_lm_1b-1.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8ec7eccd05953fa5be62f9864eaece61e9b800dfa02bf1596cbf9af0502118d +size 5759681220 diff --git a/c4_original-open_lm_1b-1.0/params.txt b/c4_original-open_lm_1b-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..3aeba3ae823722c7e6191893d7c3c0fdc126f767 --- /dev/null +++ b/c4_original-open_lm_1b-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 2 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-open_lm_1b-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 64 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/c4_original-open_lm_1b-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: open_lm_1b +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-open_lm_1b-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 8 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-open_lm_1b-1.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/de-en/val_de-en_000.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_010.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_020.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_030.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_040.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_050.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_060.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_070.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_080.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_090.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_100.tar'] +val_data_key: ['json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 5000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/c4_original-open_lm_1b-4.0/checkpoints/epoch_6.pt b/c4_original-open_lm_1b-4.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..607c5a0f101294d576626c758b9c63eb2f1da739 --- /dev/null +++ b/c4_original-open_lm_1b-4.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f60946dc0838c6733de1e41bb68bfe233dd3503acfd5d3db0baefb9f233ba36 +size 5759681028 diff --git a/c4_original-open_lm_1b-4.0/params.txt b/c4_original-open_lm_1b-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..27f6dba6c01fbead67aff6d7658fee8937ed0d5d --- /dev/null +++ b/c4_original-open_lm_1b-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: logs/448/c4_original-open_lm_1b-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: logs/448/c4_original-open_lm_1b-4.0/out.log +logs: logs/448 +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: open_lm_1b +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: c4_original-open_lm_1b-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-open_lm_1b-4.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/de-en/val_de-en_000.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_010.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_020.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_030.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_040.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_050.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_060.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_070.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_080.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_090.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_100.tar'] +val_data_key: ['json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 5000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt b/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt new file mode 100644 index 0000000000000000000000000000000000000000..d586f62f77c6ae149f45c136a69defbda56747e0 --- /dev/null +++ b/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f01dcc23d4a5f1258035587d719b48aa736bcee671bf8046c3289ae942ef6002 +size 1646767740 diff --git a/rpj-d=1024_l=24_h=8-0.25/params.txt b/rpj-d=1024_l=24_h=8-0.25/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b7cf2461515aad25c13e772ab23e039b4adb84a --- /dev/null +++ b/rpj-d=1024_l=24_h=8-0.25/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=1024_l=24_h=8-0.25/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=1024_l=24_h=8-0.25/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=1024_l=24_h=8-0.25 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt b/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..7902083f0afe20dcb30f1c6686ed1516df1a9574 --- /dev/null +++ b/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bff07f96c9d110b1bbf8c607dbaf2b8a93aea363dfa0e647f53321842ced2f4d +size 1646767740 diff --git a/rpj-d=1024_l=24_h=8-0.5/params.txt b/rpj-d=1024_l=24_h=8-0.5/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..18be35ecd1d519a5a4879966fdbc2cf130b0402f --- /dev/null +++ b/rpj-d=1024_l=24_h=8-0.5/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=1024_l=24_h=8-0.5/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=1024_l=24_h=8-0.5/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=1024_l=24_h=8-0.5 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt b/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3f59dfe6862770c1ccededf84e08966f9e0daa0 --- /dev/null +++ b/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c81736fe3ae79cf1c2bd9b3c774e9f4b8e0c0eba8d0b405cec369b6d666041f +size 1646767740 diff --git a/rpj-d=1024_l=24_h=8-1.0/params.txt b/rpj-d=1024_l=24_h=8-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..33b7aaae7757c4a6216d8f61b91686483176acd8 --- /dev/null +++ b/rpj-d=1024_l=24_h=8-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=1024_l=24_h=8-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=1024_l=24_h=8-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=1024_l=24_h=8-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt b/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..e461efa994290ca1e34c2553e1d1227aab57017d --- /dev/null +++ b/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42b966f6de2ee16e5db6d5c7420326d230c46d48454d2ed18bfe8e7490adea3f +size 1646767036 diff --git a/rpj-d=1024_l=24_h=8-16.0/params.txt b/rpj-d=1024_l=24_h=8-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbb9177edf705bbf667d13a6f6b1ef81a12f77a6 --- /dev/null +++ b/rpj-d=1024_l=24_h=8-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 2 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: logs/17288/rpj-d=1024_l=24_h=8-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 32 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: logs/17288/rpj-d=1024_l=24_h=8-16.0/out.log +logs: logs/17288 +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=1024_l=24_h=8-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 8 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 2 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt b/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..d476864807fdeab5105ddf616aaa8feedb04af8c --- /dev/null +++ b/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72929598b09d70207cbf9d48cd8b99d4746e8531c2f37bac6afd886fce425a8f +size 1646767740 diff --git a/rpj-d=1024_l=24_h=8-2.0/params.txt b/rpj-d=1024_l=24_h=8-2.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b566cca8c7791370cfb38852d28a61aafa2f2a2 --- /dev/null +++ b/rpj-d=1024_l=24_h=8-2.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=1024_l=24_h=8-2.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=1024_l=24_h=8-2.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=1024_l=24_h=8-2.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt b/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..92736f0cdd6b622d0272b5bc3cc96fd264208cac --- /dev/null +++ b/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dabaa8ef8df8b07d81e574731cc0974b65010289e0d25da297141c5964f3cbb +size 1646767036 diff --git a/rpj-d=1024_l=24_h=8-32.0/params.txt b/rpj-d=1024_l=24_h=8-32.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5db0d8a38134522ae68c879bdd0c1714125b8b9 --- /dev/null +++ b/rpj-d=1024_l=24_h=8-32.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 2 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: logs/27127/rpj-d=1024_l=24_h=8-32.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 32 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: logs/27127/rpj-d=1024_l=24_h=8-32.0/out.log +logs: logs/27127 +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=1024_l=24_h=8-32.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 8 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 2 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt b/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cdf553641a9acd1699c70e07311925022894469 --- /dev/null +++ b/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2739681454be644ab4d50e58bca56e5b6cd7e53b94bbc659104312a890c318e0 +size 1646767740 diff --git a/rpj-d=1024_l=24_h=8-4.0/params.txt b/rpj-d=1024_l=24_h=8-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f0651520fd925f5aa91a107e141bb1b3439ef50 --- /dev/null +++ b/rpj-d=1024_l=24_h=8-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=1024_l=24_h=8-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=1024_l=24_h=8-4.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=1024_l=24_h=8-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt b/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..73a14fa7f52f38af1167aaf2944fd5ce1e99aba0 --- /dev/null +++ b/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab692c8376fe89916fe303a4709aa8725e439c9dfa328df806270c9089f89a2f +size 1646767740 diff --git a/rpj-d=1024_l=24_h=8-8.0/params.txt b/rpj-d=1024_l=24_h=8-8.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..5286100615e55ddbac122dbe1e3677c6e67ef676 --- /dev/null +++ b/rpj-d=1024_l=24_h=8-8.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=1024_l=24_h=8-8.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=1024_l=24_h=8-8.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=1024_l=24_h=8-8.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt b/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt new file mode 100644 index 0000000000000000000000000000000000000000..fec65d90df6accad29f3252a135396d8714351a6 --- /dev/null +++ b/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:534524981de7304b96b9505ed2df29bd34a82d89c729b38af8e0a1d1303abe7e +size 315725493 diff --git a/rpj-d=512_l=8_h=4-0.25/params.txt b/rpj-d=512_l=8_h=4-0.25/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..a66f6daf7dc1c89833cec2a10c99266ce5accd0e --- /dev/null +++ b/rpj-d=512_l=8_h=4-0.25/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-0.25/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-0.25/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=512_l=8_h=4-0.25 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt b/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt new file mode 100644 index 0000000000000000000000000000000000000000..98619296cf991edd1e5b0a2acb67c54c9c94d920 --- /dev/null +++ b/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44e8ecbd720ae4ffba9de0574ed53aca5211300925720a8cbd05a5ed657a459a +size 315725493 diff --git a/rpj-d=512_l=8_h=4-0.5/params.txt b/rpj-d=512_l=8_h=4-0.5/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3259788426bbc9fd6d9da7ed2de9edf9241cbc6 --- /dev/null +++ b/rpj-d=512_l=8_h=4-0.5/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-0.5/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-0.5/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=512_l=8_h=4-0.5 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt b/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt new file mode 100644 index 0000000000000000000000000000000000000000..743d01857d693a9e682b0c6dd65d652275e26216 --- /dev/null +++ b/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbd2f43a38b0af716d799d615025fb36756373c4b289e9eca0b11889c72f752b +size 315725493 diff --git a/rpj-d=512_l=8_h=4-1.0/params.txt b/rpj-d=512_l=8_h=4-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6cbf86fa4b66073103c658ec2eab1a58569a917 --- /dev/null +++ b/rpj-d=512_l=8_h=4-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=512_l=8_h=4-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt b/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..9385fda4a518c7fa827b01adbc1f1aee53e00de9 --- /dev/null +++ b/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0d12e5649cf78609a12405f587fe10f340792b36d9240cfa911f6c2d2bdad77 +size 315725493 diff --git a/rpj-d=512_l=8_h=4-16.0/params.txt b/rpj-d=512_l=8_h=4-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..36a94f1be361b85f916ea8d7d1608f70e8ad80b0 --- /dev/null +++ b/rpj-d=512_l=8_h=4-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-16.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=512_l=8_h=4-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt b/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt new file mode 100644 index 0000000000000000000000000000000000000000..23f796280ad6a640022546491992ab8bbef92b42 --- /dev/null +++ b/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:949874cd98a79baf2841392cd4db823261578cfcf6f3ad2fa8f094cc24bfc1c9 +size 315725493 diff --git a/rpj-d=512_l=8_h=4-2.0/params.txt b/rpj-d=512_l=8_h=4-2.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..634fcffeb7e6208c297745417359abd47365b68a --- /dev/null +++ b/rpj-d=512_l=8_h=4-2.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-2.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-2.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=512_l=8_h=4-2.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt b/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..1bd062eb8fffee23b70be9a1cbbf444edde8b099 --- /dev/null +++ b/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3d7777c2a5def92e9420593ebdfe3d7ac2e59517da63d9edaf83bdaebbba075 +size 315725493 diff --git a/rpj-d=512_l=8_h=4-32.0/params.txt b/rpj-d=512_l=8_h=4-32.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..296e57f8868ce3908a702a7e4e4c82c92000cae2 --- /dev/null +++ b/rpj-d=512_l=8_h=4-32.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-32.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-32.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=512_l=8_h=4-32.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt b/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..d29e0be6dc4627c6b0f36122cf8a4b737015d710 --- /dev/null +++ b/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:873240b1e610280a103c43ce262247f2f77e4f07eed80782c1fa697ab96bebfa +size 315725493 diff --git a/rpj-d=512_l=8_h=4-4.0/params.txt b/rpj-d=512_l=8_h=4-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..5655ad87450e6258f77ba68da2111bb63dddf238 --- /dev/null +++ b/rpj-d=512_l=8_h=4-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-4.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=512_l=8_h=4-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt b/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e650d7c82d90201eb0a0f748b6fa0ec8d58668b --- /dev/null +++ b/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5744062fda30af6859731a4a885938a10645a2717fcf43f1717e78914c3bb78f +size 315725493 diff --git a/rpj-d=512_l=8_h=4-8.0/params.txt b/rpj-d=512_l=8_h=4-8.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..3bf5131fe3df4adf995fd1cbd01b7b7c6b2d153d --- /dev/null +++ b/rpj-d=512_l=8_h=4-8.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-8.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=512_l=8_h=4-8.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=512_l=8_h=4-8.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt b/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a63b6fa07924381f7c6396863e0a0db0a7a8069 --- /dev/null +++ b/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d6c734e16e11370ef6435e353c08bac6e81b1738682bf1ac47886e57f8839eb +size 614923196 diff --git a/rpj-d=576_l=24_h=8-0.25/params.txt b/rpj-d=576_l=24_h=8-0.25/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ccf55bacc7be67abdcbb66495bd41b868964718 --- /dev/null +++ b/rpj-d=576_l=24_h=8-0.25/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-0.25/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-0.25/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=576_l=24_h=8-0.25 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt b/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f90530f95dd1e0575ddadff65986e5ed58ee35d --- /dev/null +++ b/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd1369f3385a7026a6973f3e7023787436f76558b1e6d198a81404a5e8830162 +size 614923196 diff --git a/rpj-d=576_l=24_h=8-0.5/params.txt b/rpj-d=576_l=24_h=8-0.5/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c8f107db9efe0feb4a009cdf50f66196d4f2b61 --- /dev/null +++ b/rpj-d=576_l=24_h=8-0.5/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-0.5/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-0.5/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=576_l=24_h=8-0.5 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt b/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..567820cc1bd5cad83b30126cc80fff3c9f7635b3 --- /dev/null +++ b/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d383aac31c9712a9bd623ec85430ca9ce2d7984621d3057c924d596ccb5acf33 +size 614923196 diff --git a/rpj-d=576_l=24_h=8-1.0/params.txt b/rpj-d=576_l=24_h=8-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..d299c9ea78b12b927f839164a7548301122fd7a4 --- /dev/null +++ b/rpj-d=576_l=24_h=8-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=576_l=24_h=8-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt b/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7a3c7272d351a1af0b8a1178a0e0c2ec07dea59 --- /dev/null +++ b/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38722ff64c541a7e01dc61a30204f39d810e40fc3d3d625c7acf7ec0574c4fce +size 614923196 diff --git a/rpj-d=576_l=24_h=8-16.0/params.txt b/rpj-d=576_l=24_h=8-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f7a451a0a43296aab97f53db58b44b39511feec --- /dev/null +++ b/rpj-d=576_l=24_h=8-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-16.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=576_l=24_h=8-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt b/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bad2a1c8783bdad6c0dbb58fd40074c157ab779 --- /dev/null +++ b/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0b1164c8ec8b4fb6b764718894cbe5b4b3099218ea3ab1b4f007d3ff72a5e3e +size 614923196 diff --git a/rpj-d=576_l=24_h=8-2.0/params.txt b/rpj-d=576_l=24_h=8-2.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a1c4437513ac7e69fa066097bba43d99723fca1 --- /dev/null +++ b/rpj-d=576_l=24_h=8-2.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-2.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-2.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=576_l=24_h=8-2.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt b/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..33a8e739853f66ef6acbccdcb8707f356bffcfbd --- /dev/null +++ b/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:080fc8dd5748e16a354cd40bd12b1de94968b87e6c992c72a97836c469d3fb67 +size 614923196 diff --git a/rpj-d=576_l=24_h=8-32.0/params.txt b/rpj-d=576_l=24_h=8-32.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8844d916f66ed349ef7e377a4866f670db8326c --- /dev/null +++ b/rpj-d=576_l=24_h=8-32.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-32.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-32.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=576_l=24_h=8-32.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt b/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..a656751010780861cf7cc86f59332c5f9d5cadcb --- /dev/null +++ b/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47491e930f7b960ff38ba1c65014bfdbdd396029b6edc09ea10651b30502008e +size 614923196 diff --git a/rpj-d=576_l=24_h=8-4.0/params.txt b/rpj-d=576_l=24_h=8-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c17b0187cf108411dd9424f915e5afb44533d07 --- /dev/null +++ b/rpj-d=576_l=24_h=8-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-4.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=576_l=24_h=8-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt b/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..29fe18b3bcdf8536bc2e61cdeae42eb272374948 --- /dev/null +++ b/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5a623e2ea67dfbd52632b81a31a0cf12b8b91e32ad716903d6f9e3651fa92fa +size 614923196 diff --git a/rpj-d=576_l=24_h=8-8.0/params.txt b/rpj-d=576_l=24_h=8-8.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..b45b3c661adf7548f90d9dd7f0e25ffa598d6ceb --- /dev/null +++ b/rpj-d=576_l=24_h=8-8.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-8.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=576_l=24_h=8-8.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=576_l=24_h=8-8.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt b/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b844181f31d88d76d2296d990d5c8b44186cb7e --- /dev/null +++ b/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d20380a5ae1eba3e20421c52d614e6a5b554e8c255348325fd4d4b41f83e836 +size 42317749 diff --git a/rpj-d=96_l=8_h=4-0.25/params.txt b/rpj-d=96_l=8_h=4-0.25/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..5697778fe4acb01d51fa86911076ef75872e1e5c --- /dev/null +++ b/rpj-d=96_l=8_h=4-0.25/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-0.25/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-0.25/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=96_l=8_h=4-0.25 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt b/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..f00a4c9fd3bec7f0c726ee5754b7a19bf2aa3c6a --- /dev/null +++ b/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cecf51a35328418d6af356b4704f241716e24efdd61b683491f61007586c1ad8 +size 42317749 diff --git a/rpj-d=96_l=8_h=4-0.5/params.txt b/rpj-d=96_l=8_h=4-0.5/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..3415cd704da790706e10e7ed5b57334f2cb9797c --- /dev/null +++ b/rpj-d=96_l=8_h=4-0.5/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-0.5/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-0.5/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=96_l=8_h=4-0.5 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt b/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae2f979a19d2de5e9f266634a188c66b712fa4e1 --- /dev/null +++ b/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2be6476ffdf75cc60a3b7a45a8972b244cf837716eb84509b9acce962a257c6 +size 42317749 diff --git a/rpj-d=96_l=8_h=4-1.0/params.txt b/rpj-d=96_l=8_h=4-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef6f8d43a9ab74c55e0dd8ed693bcda4b88953b0 --- /dev/null +++ b/rpj-d=96_l=8_h=4-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=96_l=8_h=4-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt b/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt new file mode 100644 index 0000000000000000000000000000000000000000..b49a45055b95ea872b9e865e150938ea938742ed --- /dev/null +++ b/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f72baf894902a01cf88b91ac54d0adba3482a736df8f64c0a3a7b4edee82346d +size 42317749 diff --git a/rpj-d=96_l=8_h=4-16.0/params.txt b/rpj-d=96_l=8_h=4-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..50c7c9cf4158767b2f584a11c4d0ba5906d3168a --- /dev/null +++ b/rpj-d=96_l=8_h=4-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-16.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=96_l=8_h=4-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt b/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fded335f82df05c57698bfa4c0f736821efd58c --- /dev/null +++ b/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7528eb28963855ade13d4fad645cf69c929c9230cc6e000252ea274feb29055 +size 42317749 diff --git a/rpj-d=96_l=8_h=4-2.0/params.txt b/rpj-d=96_l=8_h=4-2.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..381ebe590a69ac53013dacd36c2efff14887cafd --- /dev/null +++ b/rpj-d=96_l=8_h=4-2.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-2.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-2.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=96_l=8_h=4-2.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt b/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..0471f4a3a06d096ab20727fdd32fd62b199284bf --- /dev/null +++ b/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b5bf8233e448dfd0aed90d3544646388a781cfec88585fca264cb7f79688135 +size 42317749 diff --git a/rpj-d=96_l=8_h=4-32.0/params.txt b/rpj-d=96_l=8_h=4-32.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..d519361348e62c7a7a14c36459cd795277272ca6 --- /dev/null +++ b/rpj-d=96_l=8_h=4-32.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-32.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-32.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=96_l=8_h=4-32.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt b/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt new file mode 100644 index 0000000000000000000000000000000000000000..12ed21a51bf018be2623417d09f29fa3acc028fd --- /dev/null +++ b/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a98b54f1f83b6406d7cb62f80933f10213804ebe4f724c5432b2d2f465f1671b +size 42317749 diff --git a/rpj-d=96_l=8_h=4-4.0/params.txt b/rpj-d=96_l=8_h=4-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..7451873620f7a3138e34173d001219f74630d7c0 --- /dev/null +++ b/rpj-d=96_l=8_h=4-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-4.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=96_l=8_h=4-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt b/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5abee37f19bf04220a483ff669efaada05f2257 --- /dev/null +++ b/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9184d93765c38fad0a356b76fd75e948542ae53603dd13a52b71671ac469ab7d +size 42317749 diff --git a/rpj-d=96_l=8_h=4-8.0/params.txt b/rpj-d=96_l=8_h=4-8.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..44efc58849f96f06ca5d755e8d2bc42dda196282 --- /dev/null +++ b/rpj-d=96_l=8_h=4-8.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-8.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-d=96_l=8_h=4-8.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-d=96_l=8_h=4-8.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-open_lm_1b-1.0/checkpoints/epoch_6.pt b/rpj-open_lm_1b-1.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..28c9c6a558e8c67f55cc71f1db24147cc7f5fe27 --- /dev/null +++ b/rpj-open_lm_1b-1.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b1ea413ca37c589f5ea943bbc6b3a6d8edee21201bb5e97041009574d222c6e +size 5759681220 diff --git a/rpj-open_lm_1b-1.0/params.txt b/rpj-open_lm_1b-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ea21268e7a51d069569c62579e3ddcd0bdc45a6 --- /dev/null +++ b/rpj-open_lm_1b-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 2 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rpj-open_lm_1b-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 64 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rpj-open_lm_1b-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: open_lm_1b +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-open_lm_1b-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 8 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-open_lm_1b-1.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/de-en/val_de-en_000.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_010.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_020.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_030.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_040.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_050.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_060.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_070.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_080.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_090.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_100.tar'] +val_data_key: ['json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 5000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rpj-open_lm_1b-32.0/checkpoints/epoch_6.pt b/rpj-open_lm_1b-32.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a7c0e8cdbabcac8ce886550f2a3e534eb6800bc --- /dev/null +++ b/rpj-open_lm_1b-32.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1a71b39d667609f03b3b85cc9fbcf2be1ebf8587f4c147bb18d29e5c88b1287 +size 5759680516 diff --git a/rpj-open_lm_1b-32.0/params.txt b/rpj-open_lm_1b-32.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..1faffd5ba0a62f403c9032f294f0957e096667db --- /dev/null +++ b/rpj-open_lm_1b-32.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: logs/11578/rpj-open_lm_1b-32.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: logs/11578/rpj-open_lm_1b-32.0/out.log +logs: logs/11578 +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: open_lm_1b +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rpj-open_lm_1b-32.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-open_lm_1b-32.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/de-en/val_de-en_000.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_010.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_020.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_030.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_040.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_050.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_060.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_070.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_080.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_090.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_100.tar'] +val_data_key: ['json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 5000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt b/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt new file mode 100644 index 0000000000000000000000000000000000000000..56ac68cc02d922970fba788682fddceafdcd513e --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:593f0f5f68ea3557d90bb7cb3f0864c7088f8f2887ded61622c3f59b68f8bd8f +size 1646767740 diff --git a/rw_original-d=1024_l=24_h=8-0.25/params.txt b/rw_original-d=1024_l=24_h=8-0.25/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..a265f280d6ecde82197249cd422f026e86f7aa66 --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-0.25/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=1024_l=24_h=8-0.25/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=1024_l=24_h=8-0.25/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=1024_l=24_h=8-0.25 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt b/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..8590f92bc3edf00a3687118a9693d8a894f11899 --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:785f4a612748e444275acd4931ba762126060ec95026cab2291632f3abafdddb +size 1646767740 diff --git a/rw_original-d=1024_l=24_h=8-0.5/params.txt b/rw_original-d=1024_l=24_h=8-0.5/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5cb74aad46e4000c9563f75ed5329aaadd81e82 --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-0.5/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=1024_l=24_h=8-0.5/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=1024_l=24_h=8-0.5/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=1024_l=24_h=8-0.5 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt b/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..fedba709f79fb486b3330952f13b6d690c870e2f --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b813cf654337c461585ca95e161a2690f21f906dd0c905469f466944af264f39 +size 1646767740 diff --git a/rw_original-d=1024_l=24_h=8-1.0/params.txt b/rw_original-d=1024_l=24_h=8-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..a66d90f8e1ab4321c5ae58777c0966e38a4da6e3 --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=1024_l=24_h=8-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=1024_l=24_h=8-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=1024_l=24_h=8-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt b/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..806114d2835832e27302f6afedf7a81cfbb0734d --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46fb8fc30e46918954b0427a3c0c1a3da9cfeef14586243905fe99bf0722f94 +size 1646767804 diff --git a/rw_original-d=1024_l=24_h=8-16.0/params.txt b/rw_original-d=1024_l=24_h=8-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1620c20677e5441ab474ad16c15d1e492e6d55f --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 2 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /tmp/achal-dave-openlm-dcnlp_2024-01-26-08-26-07-183/rw_original-d=1024_l=24_h=8-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 64 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /tmp/achal-dave-openlm-dcnlp_2024-01-26-08-26-07-183/rw_original-d=1024_l=24_h=8-16.0/out.log +logs: /tmp/achal-dave-openlm-dcnlp_2024-01-26-08-26-07-183 +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=1024_l=24_h=8-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 8 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt b/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad462af7c10ee9221a15863663b2c12822699a8b --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c94f325cc72b357a8ef1a2448767b89afd1060b901d5028faaeeb2d8a640c75 +size 1646767740 diff --git a/rw_original-d=1024_l=24_h=8-2.0/params.txt b/rw_original-d=1024_l=24_h=8-2.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..8db3cb0394f3230e1e13f774eb35173a73e08438 --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-2.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=1024_l=24_h=8-2.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=1024_l=24_h=8-2.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=1024_l=24_h=8-2.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt b/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..f436d72fca5987ae9049e7edf548e3096e96ffb3 --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f31ea99175da2b881816745092676f30b2a12d8e98fbfc27355a1bfda846c4e8 +size 1646767804 diff --git a/rw_original-d=1024_l=24_h=8-32.0/params.txt b/rw_original-d=1024_l=24_h=8-32.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..8284a34442e7977166f3059ce6d4e29555241c98 --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-32.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 2 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /tmp/achal-dave-openlm-dcnlp_2024-01-26-08-25-53-415/rw_original-d=1024_l=24_h=8-32.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 64 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /tmp/achal-dave-openlm-dcnlp_2024-01-26-08-25-53-415/rw_original-d=1024_l=24_h=8-32.0/out.log +logs: /tmp/achal-dave-openlm-dcnlp_2024-01-26-08-25-53-415 +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=1024_l=24_h=8-32.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 8 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt b/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c4dbb8472034c9d55cd9e5c197be8eaba181e6a --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bec20f891478f67a2262107fd4a7eb93a440e090800443590515392d2e7fca7 +size 1646767740 diff --git a/rw_original-d=1024_l=24_h=8-4.0/params.txt b/rw_original-d=1024_l=24_h=8-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..19997c99d5a347f99f7caf8f778c08db0ae19f53 --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=1024_l=24_h=8-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=1024_l=24_h=8-4.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=1024_l=24_h=8-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt b/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..12e1ccff98f03ea02ad900f67e317a5b5e74ff29 --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4c0a73b95a7c623755b2e0c3f68c95415e4e137de600cf8f06d0ff4e8eb35d6 +size 1646767740 diff --git a/rw_original-d=1024_l=24_h=8-8.0/params.txt b/rw_original-d=1024_l=24_h=8-8.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..12939d47a1f1fc22aef0b9b5ba62c0088e93acee --- /dev/null +++ b/rw_original-d=1024_l=24_h=8-8.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=1024_l=24_h=8-8.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=1024_l=24_h=8-8.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=1024_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=1024_l=24_h=8-8.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 2000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt b/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt new file mode 100644 index 0000000000000000000000000000000000000000..b89518818e98f93f93e91dab3333d809b48c1560 --- /dev/null +++ b/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a892795a4ddc1ef4799d809876b2f8807f1a591cd690e357c507285a139ca9a3 +size 315725493 diff --git a/rw_original-d=512_l=8_h=4-0.25/params.txt b/rw_original-d=512_l=8_h=4-0.25/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..beddfdc7d620dec9cde2e90e1de2400b3669551a --- /dev/null +++ b/rw_original-d=512_l=8_h=4-0.25/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-0.25/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-0.25/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=512_l=8_h=4-0.25 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt b/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f4d78bc58fa266d40f602582c83cbfb6f575c81 --- /dev/null +++ b/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dc8a18efd1d0a389ca6dfb601c449d93ac8b4a6a7399720b9d86f9309610af4 +size 315725493 diff --git a/rw_original-d=512_l=8_h=4-0.5/params.txt b/rw_original-d=512_l=8_h=4-0.5/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..64c67fbb86deeb74f45cb4767f8afedd7b1b80be --- /dev/null +++ b/rw_original-d=512_l=8_h=4-0.5/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-0.5/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-0.5/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=512_l=8_h=4-0.5 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt b/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt new file mode 100644 index 0000000000000000000000000000000000000000..98cbd522d7edc5627d8268589046d63aa5100ed5 --- /dev/null +++ b/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a6af25076b362a527095ab719cc99dd92b9053124b1206c3d48a440fac9fa5f +size 315725557 diff --git a/rw_original-d=512_l=8_h=4-1.0/params.txt b/rw_original-d=512_l=8_h=4-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..6df82310a5d2a1cf63d843b8c17afcc9c1ea8018 --- /dev/null +++ b/rw_original-d=512_l=8_h=4-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=512_l=8_h=4-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt b/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9c7700002a8b719691d6a8dff53dc3101d8152b --- /dev/null +++ b/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e01af896560dd2d24abf4fe873edd7410e484526e0fe3bb5bf6fffd5b3555ca4 +size 315725557 diff --git a/rw_original-d=512_l=8_h=4-16.0/params.txt b/rw_original-d=512_l=8_h=4-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3b59010146428921863c548e4627ee3870eb579 --- /dev/null +++ b/rw_original-d=512_l=8_h=4-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-16.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=512_l=8_h=4-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt b/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e3190dea264621259ec92f8a7abd56f035f9b9e --- /dev/null +++ b/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b241ff18f560f46c99cecda38fd8876472cb4524805ef21ad55032474c7149d +size 315725557 diff --git a/rw_original-d=512_l=8_h=4-2.0/params.txt b/rw_original-d=512_l=8_h=4-2.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ba72defca0be2342369ac954b193863409068d6 --- /dev/null +++ b/rw_original-d=512_l=8_h=4-2.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-2.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-2.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=512_l=8_h=4-2.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt b/rw_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..20982a110e4f6f5aaaacc8322fd7d8e49aad5307 --- /dev/null +++ b/rw_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99095c282bf7908c15afdce0bca73e5e27fdd98e0eff301be3a3547af1657ab0 +size 315724789 diff --git a/rw_original-d=512_l=8_h=4-32.0/params.txt b/rw_original-d=512_l=8_h=4-32.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..16e0b15b71ac242411b60d017d0999a8129e02cb --- /dev/null +++ b/rw_original-d=512_l=8_h=4-32.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /tmp/achal-dave-openlm-dcnlp_2024-01-31-12-48-45-433/rw_original-d=512_l=8_h=4-32.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /tmp/achal-dave-openlm-dcnlp_2024-01-31-12-48-45-433/rw_original-d=512_l=8_h=4-32.0/out.log +logs: /tmp/achal-dave-openlm-dcnlp_2024-01-31-12-48-45-433 +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=512_l=8_h=4-32.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt b/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt new file mode 100644 index 0000000000000000000000000000000000000000..3de28d8076a5fe894abf64d8a121c29a75b1e4b6 --- /dev/null +++ b/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9029d2e1e2d4f7b4105d3867f09c3272c6d8a0cdbdc7336fcade02face7f8cd8 +size 315725557 diff --git a/rw_original-d=512_l=8_h=4-4.0/params.txt b/rw_original-d=512_l=8_h=4-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f29003bca779ec3cfa51eef255d3497aaa1ccbb --- /dev/null +++ b/rw_original-d=512_l=8_h=4-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-4.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=512_l=8_h=4-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt b/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e1fe91cd3d8124af38302db6a5c7b0fa0ec0fa2 --- /dev/null +++ b/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3df57532c9976d46aef52ae86613052555223acabca6c9e561d8cf7704986683 +size 315725557 diff --git a/rw_original-d=512_l=8_h=4-8.0/params.txt b/rw_original-d=512_l=8_h=4-8.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..f47abcc916301b1e9f8de805232d639a9a923fbe --- /dev/null +++ b/rw_original-d=512_l=8_h=4-8.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-8.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=512_l=8_h=4-8.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=512_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=512_l=8_h=4-8.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt b/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7e5a53365c738e96a7b78e02ee72f18c2d1bcf1 --- /dev/null +++ b/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3b878bc9d9d9cd266cfcb7ff8ea977aa9c5852c1d06fd2215b105b95be8d330 +size 614923196 diff --git a/rw_original-d=576_l=24_h=8-0.25/params.txt b/rw_original-d=576_l=24_h=8-0.25/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f81b8e245c1b8648439a08beea5fd0be13df28b --- /dev/null +++ b/rw_original-d=576_l=24_h=8-0.25/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-0.25/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-0.25/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=576_l=24_h=8-0.25 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt b/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..51566c9c0d1a5388e03fdb7ed51e9eaf50e0668e --- /dev/null +++ b/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a86d697ead8145673ae8448c899a67b3d35db8c244770f110aa0924936f1cfcf +size 614923196 diff --git a/rw_original-d=576_l=24_h=8-0.5/params.txt b/rw_original-d=576_l=24_h=8-0.5/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..02589d400a928daec5a3e5d238a56011228f32a6 --- /dev/null +++ b/rw_original-d=576_l=24_h=8-0.5/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-0.5/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-0.5/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=576_l=24_h=8-0.5 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt b/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..77e732c6563432db2aa17aafaee79a125e8352fe --- /dev/null +++ b/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9796f29dbee9e01a6795f4b8e6e8d0367e908e55e8bb69f0f5573b118036c75 +size 614923196 diff --git a/rw_original-d=576_l=24_h=8-1.0/params.txt b/rw_original-d=576_l=24_h=8-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..a24bcefa5ceac8343d5717eb37b9dd456b58ead1 --- /dev/null +++ b/rw_original-d=576_l=24_h=8-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=576_l=24_h=8-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt b/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..555fa5e307ccadfdef9a7db2594534a9096e85f8 --- /dev/null +++ b/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01ce0b6c39f8667327a2a4720e3c280ce5139218ca5252ee04f5c62496f7b67b +size 614923196 diff --git a/rw_original-d=576_l=24_h=8-16.0/params.txt b/rw_original-d=576_l=24_h=8-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b03727cf8a2ae31ea83457cf0a0e89e9a5fb0a4 --- /dev/null +++ b/rw_original-d=576_l=24_h=8-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-16.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=576_l=24_h=8-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt b/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbd3da99dc3600ed90e984bb1943374a8fc993a0 --- /dev/null +++ b/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:139317705f6707b76fbb2255e6da925812ecf63ec2294892f0966e56e376eb95 +size 614923196 diff --git a/rw_original-d=576_l=24_h=8-2.0/params.txt b/rw_original-d=576_l=24_h=8-2.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..145a61866da19e0c7dd6c1ee19bea5856295adb9 --- /dev/null +++ b/rw_original-d=576_l=24_h=8-2.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-2.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-2.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=576_l=24_h=8-2.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt b/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..b23610d769d18a5bccd350b5528c9ab48d5fdc1a --- /dev/null +++ b/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2989ad86cd0758f36972baca37438cb5664e275a636e4d237fb5ed993b6f7913 +size 614923196 diff --git a/rw_original-d=576_l=24_h=8-32.0/params.txt b/rw_original-d=576_l=24_h=8-32.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..da718c58ad74590e1f13258e04096e3edcdb4a72 --- /dev/null +++ b/rw_original-d=576_l=24_h=8-32.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-32.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-32.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=576_l=24_h=8-32.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt b/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..d875885fc19cac7d29a0256f5df6aedfa6fd5adc --- /dev/null +++ b/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647542d2b420364ca8fa93a3e00fd2eb78c4423edb3946108f5c67ccc0cf26b0 +size 614922428 diff --git a/rw_original-d=576_l=24_h=8-4.0/params.txt b/rw_original-d=576_l=24_h=8-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f637478bf63dcda1524fe02de1913567fe25806 --- /dev/null +++ b/rw_original-d=576_l=24_h=8-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-4.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=576_l=24_h=8-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt b/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..7167f6a584aa1ebd30022c204dcf4de2ca61ab66 --- /dev/null +++ b/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:565fa5ba665cb085d4c1b73f823d5b229422fcd43548e8c220433955b4700bf3 +size 614922428 diff --git a/rw_original-d=576_l=24_h=8-8.0/params.txt b/rw_original-d=576_l=24_h=8-8.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..486ef4cb16b638fb2664c1667b42d039eb6a7b3c --- /dev/null +++ b/rw_original-d=576_l=24_h=8-8.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 8 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-8.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 16 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=576_l=24_h=8-8.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=576_l=24_h=8 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=576_l=24_h=8-8.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 2 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 400 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt b/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b586fd0738283752cc0c7c163f2aef70ebc6c9b --- /dev/null +++ b/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdd216e78d811fc04d7704cbe8e370e8eb2fea9eef2c1a152fc0168aef67ef57 +size 42317749 diff --git a/rw_original-d=96_l=8_h=4-0.25/params.txt b/rw_original-d=96_l=8_h=4-0.25/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..1cef0b30d79497bff0f85aaf8135d3cb5c36da0a --- /dev/null +++ b/rw_original-d=96_l=8_h=4-0.25/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-0.25/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-0.25/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=96_l=8_h=4-0.25 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt b/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..9660493b5f924879ab3ee2bb32bb9a58c3b821e0 --- /dev/null +++ b/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01242328d6d8bccb68342800bc192461255380530cdad0f8a637d96f71e8cda4 +size 42317749 diff --git a/rw_original-d=96_l=8_h=4-0.5/params.txt b/rw_original-d=96_l=8_h=4-0.5/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9ad52a40eb4fd191921f6c91cec35d89addda64 --- /dev/null +++ b/rw_original-d=96_l=8_h=4-0.5/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-0.5/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-0.5/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=96_l=8_h=4-0.5 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt b/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5c1e99b0ecfac7e9f9dc0a4813377df8abf8061 --- /dev/null +++ b/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d261a45118db0f31dae4cc975c098d41a0a0338d0251c40b897b1174151a23df +size 42317749 diff --git a/rw_original-d=96_l=8_h=4-1.0/params.txt b/rw_original-d=96_l=8_h=4-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b06b02f78594ccc2430e01e776a43b8993f5f29 --- /dev/null +++ b/rw_original-d=96_l=8_h=4-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=96_l=8_h=4-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt b/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbba0152224c41221b9eb9ca6a82cd033758b63a --- /dev/null +++ b/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c931b5fb6b7a13f5b291cf92a0a7edd849424ced0100cc509eb8dc8d12a7077 +size 42317749 diff --git a/rw_original-d=96_l=8_h=4-16.0/params.txt b/rw_original-d=96_l=8_h=4-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..f40fb820cdfeb5279f62b308793bb1e01d2ada30 --- /dev/null +++ b/rw_original-d=96_l=8_h=4-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-16.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=96_l=8_h=4-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt b/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9898d647a055018a41a7b00b9e1263b68de2337 --- /dev/null +++ b/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b59bc2200fcc600de49ccc91f10fad8c322a41eb687a01843abde2980a0e006a +size 42317749 diff --git a/rw_original-d=96_l=8_h=4-2.0/params.txt b/rw_original-d=96_l=8_h=4-2.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5de8c4699a992b47cb84e4bb0b2d258278e0c22 --- /dev/null +++ b/rw_original-d=96_l=8_h=4-2.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-2.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-2.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=96_l=8_h=4-2.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt b/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..d24451ab2c56c2f4191c4c68f379db4052b882cb --- /dev/null +++ b/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:812c57410d203680efb7a79aabcd55a545c71a917d5efdbe06bdce109150bf89 +size 42317749 diff --git a/rw_original-d=96_l=8_h=4-32.0/params.txt b/rw_original-d=96_l=8_h=4-32.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..c908b8729f367a4fadafca434aa5c00d31c68bee --- /dev/null +++ b/rw_original-d=96_l=8_h=4-32.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-32.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-32.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=96_l=8_h=4-32.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt b/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt new file mode 100644 index 0000000000000000000000000000000000000000..cff1e7f606de269e66e4f5b29f80b5235d63e5a9 --- /dev/null +++ b/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd8e74c3e4f1d7b550012b4d898c7bc9cc9b049b26bcd4701ad3d786068be7d0 +size 42317749 diff --git a/rw_original-d=96_l=8_h=4-4.0/params.txt b/rw_original-d=96_l=8_h=4-4.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0f1117c1b2312a3da35afe74f1cac8b6af4dfd2 --- /dev/null +++ b/rw_original-d=96_l=8_h=4-4.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-4.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-4.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=96_l=8_h=4-4.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt b/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8e49b3a4c32aa2f6775a7955aa8687013cacd43 --- /dev/null +++ b/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f75bb692ccdbc9fa1f190d3419944da7d18d9865f46e4ab77bdecb3c57bf37d6 +size 42317749 diff --git a/rw_original-d=96_l=8_h=4-8.0/params.txt b/rw_original-d=96_l=8_h=4-8.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..7449d5c245a593d0b1032e0877d608e9e49c35b0 --- /dev/null +++ b/rw_original-d=96_l=8_h=4-8.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-8.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-d=96_l=8_h=4-8.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: d=96_l=8_h=4 +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-d=96_l=8_h=4-8.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] +val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 100 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-open_lm_1b-1.0/checkpoints/epoch_7.pt b/rw_original-open_lm_1b-1.0/checkpoints/epoch_7.pt new file mode 100644 index 0000000000000000000000000000000000000000..436d651b7e650b0a18f7eb37fd91d3fd9397ce05 --- /dev/null +++ b/rw_original-open_lm_1b-1.0/checkpoints/epoch_7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bce83a65a4bbe16e75513987f5a161ba4bc18a774e043edcbd55d9ef2b3af26 +size 5759681220 diff --git a/rw_original-open_lm_1b-1.0/params.txt b/rw_original-open_lm_1b-1.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f14f6f182f3a3adb76da08883a917ea46c23990 --- /dev/null +++ b/rw_original-open_lm_1b-1.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 2 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: /admin/home-sy/dcnlp_logs/rw_original-open_lm_1b-1.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 64 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: /admin/home-sy/dcnlp_logs/rw_original-open_lm_1b-1.0/out.log +logs: /admin/home-sy/dcnlp_logs +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: open_lm_1b +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-open_lm_1b-1.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 8 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-open_lm_1b-1.0/checkpoints/epoch_7.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/de-en/val_de-en_000.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_010.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_020.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_030.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_040.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_050.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_060.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_070.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_080.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_090.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_100.tar'] +val_data_key: ['json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 5000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001 diff --git a/rw_original-open_lm_1b-16.0/checkpoints/epoch_6.pt b/rw_original-open_lm_1b-16.0/checkpoints/epoch_6.pt new file mode 100644 index 0000000000000000000000000000000000000000..727fa455315358e37da9ab49afd2b723222116ec --- /dev/null +++ b/rw_original-open_lm_1b-16.0/checkpoints/epoch_6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f98a99a68d6fd717cff9b51fe9bd5cb37c6470356485341ceaf5385385ef6f80 +size 5759680452 diff --git a/rw_original-open_lm_1b-16.0/params.txt b/rw_original-open_lm_1b-16.0/params.txt new file mode 100644 index 0000000000000000000000000000000000000000..b742f86ed8c9a46b761b5571d3d7f3f0d6a330c2 --- /dev/null +++ b/rw_original-open_lm_1b-16.0/params.txt @@ -0,0 +1,123 @@ +accum_freq: 1 +attn_activation: None +attn_name: auto +attn_seq_scalar: None +attn_seq_scalar_alpha: None +average: None +average_coefficients: None +beta1: 0.9 +beta2: 0.95 +checkpoint_path: ./logs/1765/rw_original-open_lm_1b-16.0/checkpoints +copy_codebase: False +data_key: txt +dataset_manifest: None +dataset_resampled: False +dataset_type: auto +ddp_static_graph: False +debug: False +delete_previous_checkpoint: True +device: cuda:0 +disable_buffer: False +dist_backend: nccl +dist_url: env:// +distill_model: None +distill_pretrained: None +distributed: True +epochs: 5 +epochs_cooldown: None +eps: 1e-08 +experimental_meta_device: False +ffn_type: swiglu +force_distributed: False +force_min_lr: 0.0 +fsdp: False +fsdp_amp: False +fsdp_backward_prefetch: False +fsdp_checkpoint: False +fsdp_cpu_offload: False +fsdp_hybrid: False +fsdp_hybrid_o2: False +fsdp_limit_all_gathers: False +fsdp_pure_bf16: False +fsdp_use_orig_params: False +global_batch_size: 128 +global_val_batch_size: 128 +grad_checkpointing: False +grad_clip_norm: 1.0 +hf_fsdp_block: None +hf_model: None +hf_seq_len: None +ignore_parse_errors: False +load_pretrained_state: False +local_rank: 0 +log_every_n_steps: 20 +log_level: 20 +log_local: False +log_logit_mean: False +log_path: ./logs/1765/rw_original-open_lm_1b-16.0/out.log +logs: ./logs/1765 +lr: 0.003 +lr_cooldown_end: 3e-05 +lr_cooldown_power: 1.0 +lr_scheduler: cosine +model: open_lm_1b +model_norm: gain_only_lp_layer_norm +moe_capacity_factor: 1.25 +moe_expert_model_parallelism: False +moe_freq: 0 +moe_loss_weight: 0.1 +moe_num_experts: None +moe_top_k: 2 +moe_weight_parallelism: False +multiple_data_passes: False +name: rw_original-open_lm_1b-16.0 +no_set_device_rank: False +optimizer: adamw +per_gpu_batch_size: 16 +per_gpu_val_batch_size: 16 +positional_embedding_type: rotary +precision: amp_bfloat16 +pretrained: None +qk_norm: True +rank: 0 +remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 +remote_sync_frequency: 300 +remote_sync_protocol: s3 +report_to: +resume: s3://dcnlp-west/dcnlp_experiments_v3/rw_original-open_lm_1b-16.0/checkpoints/epoch_6.pt +save_frequency: 1 +save_most_recent: False +seed: 124 +seq_len: 2048 +skip_scheduler: False +squash_mask_left: True +target_mask_individual: 50400 +target_mask_left: 50300 +tensorboard: False +tensorboard_path: +torchcompile: False +torchscript: False +trace: False +train_data: None +train_data_mix_weights: None +train_data_upsampling_factors: None +train_num_samples: None +use_bn_sync: False +use_bnb_linear: None +val_data: ['training/eval_data/val_tok_mult/de-en/val_de-en_000.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_010.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_020.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_030.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_040.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_050.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_060.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_070.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_080.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_090.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_100.tar'] +val_data_key: ['json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz'] +val_frequency: 5 +val_iter_ci: 10000 +val_max_pop_ci: 300000 +val_num_samples: None +val_seq_ci: True +val_tok_ci: True +vocab_size: 50432 +wandb: False +wandb_notes: +wandb_project_name: open-lm +warmup: 5000 +wd: 0.033 +workers: 2 +world_size: 8 +z_loss_coefficient: 0.0001