---
license: apache-2.0
---

New run with DeepSpeed 0.7.7 and transformers 4.26.1 (using WarmupDecayLR, probably not optimal).

wandb run: https://wandb.ai/open-assistant/supervised-finetuning/runs/bxyaxo4v

Checkpoint: 2000 steps (~48% of the 1st epoch)

Datasets:

```
pretrain:
  use_custom_sampler: true
  sort_by_length: false
  datasets:
    - joke
    - webgpt:
        val_split: 0.1
    - gpt4all:
        val_split: 0.01
    - alpaca:
        val_split: 0.025
    - code_alpaca:
        val_split: 0.05
    - minimath
    - humaneval_mbpp_codegen_qa
    - humaneval_mbpp_testgen_qa
    - grade_school_math_instructions
    - recipes
    - cmu_wiki_qa
    #- youtube_subs_howto100m  # uses incompatible column names
    #- ubuntu_dialogue_qa      # fails to load
    - oa_wiki_qa_bart_10000row
    - prosocial_dialogue:
        fraction: 0.1
    - explain_prosocial:
        fraction: 0.05
```

Pythia:

```
pythia-12b:
  dtype: fp16
  log_dir: "pythia_log_12b"
  learning_rate: 6e-6
  model_name: EleutherAI/pythia-12b-deduped
  output_dir: pythia_model_12b
  weight_decay: 0.0
  max_length: 2048
  use_flash_attention: true
  deepspeed_config: configs/zero_conf2.json
  warmup_steps: 50
  gradient_checkpointing: true
  gradient_accumulation_steps: 2
  per_device_train_batch_size: 8
  per_device_eval_batch_size: 5
  eval_steps: 200
  save_steps: 500
  num_train_epochs: 2
  save_total_limit: 2
```

zero_conf2.json:

```
{
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "bf16": {
    "enabled": "auto"
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupDecayLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto",
      "total_num_steps": "auto"
    }
  },
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 1000000000.0,
    "overlap_comm": false,
    "reduce_scatter": true,
    "reduce_bucket_size": 1000000000.0,
    "contiguous_gradients": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "steps_per_print": 2000,
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}
```
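
For context on how the `"auto"` fields in zero_conf2.json are filled in: with the Hugging Face Trainer's DeepSpeed integration, the batch-size entries are derived from the Pythia YAML and the number of GPUs. A minimal sketch of that arithmetic, assuming an 8-GPU run (the actual world size is not stated in this card):

```
# Sketch: how DeepSpeed's "auto" batch-size fields resolve from the YAML above.
per_device_train_batch_size = 8   # from the pythia-12b config
gradient_accumulation_steps = 2   # from the pythia-12b config
world_size = 8                    # ASSUMPTION: GPU count of the run, not given here

train_micro_batch_size_per_gpu = per_device_train_batch_size          # -> 8
train_batch_size = (per_device_train_batch_size
                    * gradient_accumulation_steps
                    * world_size)                                     # -> 128
print(train_micro_batch_size_per_gpu, train_batch_size)
```

The scheduler params resolve the same way: `warmup_max_lr` comes from `learning_rate: 6e-6`, `warmup_num_steps` from `warmup_steps: 50`, and `total_num_steps` from the Trainer's computed total step count, so WarmupDecayLR decays over the full two configured epochs.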
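
For completeness, a minimal sketch of loading this checkpoint for inference with transformers. `REPO_ID` is a placeholder for this model's hub id, and the `<|prompter|>`/`<|assistant|>` prompt format is the usual Open-Assistant SFT convention, which this checkpoint is assumed to follow:

```
# Minimal inference sketch (assumptions: REPO_ID stands in for this repo's hub id,
# a CUDA GPU is available, fp16 weights as in the training config above).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

REPO_ID = "REPO_ID"  # placeholder: substitute this model's hub id

tokenizer = AutoTokenizer.from_pretrained(REPO_ID)
model = AutoModelForCausalLM.from_pretrained(
    REPO_ID, torch_dtype=torch.float16
).cuda()

# ASSUMPTION: standard Open-Assistant SFT turn format; verify against the
# tokenizer's special tokens before relying on it.
prompt = "<|prompter|>What is a lambda function?<|endoftext|><|assistant|>"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128, do_sample=True, top_p=0.95)
print(tokenizer.decode(output[0]))
```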