base_model: theblackcat102/whale-v3-base-merged
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
trust_remote_code: true

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
strict: false

adapter: qlora
lora_r: 380
lora_alpha: 380
lora_target_modules: [down_proj, gate_proj, up_proj]
lora_target_linear: false
peft_use_rslora: true

max_steps: 100000

datasets:
  - path: cerebras/SlimPajama-627B
    data_files:
      - train/chunk1/example_train_0.jsonl.zst
      - train/chunk1/example_train_1.jsonl.zst
      - train/chunk1/example_train_2.jsonl.zst
      - train/chunk1/example_train_3.jsonl.zst
      - train/chunk1/example_train_4.jsonl.zst
      - train/chunk1/example_train_5.jsonl.zst
      - train/chunk1/example_train_6.jsonl.zst
      - train/chunk1/example_train_7.jsonl.zst
      - train/chunk1/example_train_8.jsonl.zst
      - train/chunk1/example_train_9.jsonl.zst
      - train/chunk1/example_train_10.jsonl.zst
      - train/chunk1/example_train_11.jsonl.zst
      - train/chunk1/example_train_12.jsonl.zst
      - train/chunk1/example_train_13.jsonl.zst
      - train/chunk1/example_train_14.jsonl.zst
      - train/chunk1/example_train_15.jsonl.zst
      - train/chunk1/example_train_16.jsonl.zst
      - train/chunk1/example_train_17.jsonl.zst
      - train/chunk1/example_train_18.jsonl.zst
      - train/chunk1/example_train_19.jsonl.zst
      - train/chunk1/example_train_20.jsonl.zst
      - train/chunk1/example_train_21.jsonl.zst
      - train/chunk1/example_train_22.jsonl.zst
      - train/chunk1/example_train_23.jsonl.zst
      - train/chunk1/example_train_24.jsonl.zst
      - train/chunk1/example_train_25.jsonl.zst
      - train/chunk1/example_train_26.jsonl.zst
      - train/chunk1/example_train_27.jsonl.zst
      - train/chunk1/example_train_28.jsonl.zst
      - train/chunk1/example_train_29.jsonl.zst
      - train/chunk1/example_train_30.jsonl.zst
      - train/chunk1/example_train_31.jsonl.zst
      - train/chunk1/example_train_32.jsonl.zst
      - train/chunk1/example_train_33.jsonl.zst
      - train/chunk1/example_train_34.jsonl.zst
      - train/chunk1/example_train_35.jsonl.zst
      - train/chunk1/example_train_36.jsonl.zst
      - train/chunk1/example_train_37.jsonl.zst
      - train/chunk1/example_train_38.jsonl.zst
      - train/chunk1/example_train_39.jsonl.zst
      - train/chunk1/example_train_40.jsonl.zst
    split: train
    type: completion

dataset_prepared_path: last_run_mixed
val_set_size: 0.0
output_dir: ./outputs/model-out-mix

sequence_len: 1024
sample_packing: true

# unfrozen_parameters:
#   - model.layers.*.mlp.(gate|up|down)_proj.weight$

wandb_project: whale-v3-post-pt
wandb_entity: theblackcat102
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 32
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.00008
max_grad_norm: 10.0
adam_beta2: 0.95

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 50
evals_per_epoch:
eval_table_size:
save_steps: 100
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
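
# A minimal sketch of how a config like this is typically run with Axolotl,
# assuming axolotl and accelerate are installed and this file is saved as
# whale-v3-post-pt.yml (the filename here is hypothetical):
#
#   # tokenize and pack the datasets first; writes to dataset_prepared_path
#   python -m axolotl.cli.preprocess whale-v3-post-pt.yml
#
#   # launch QLoRA training
#   accelerate launch -m axolotl.cli.train whale-v3-post-pt.yml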