base_model: teknium/OpenHermes-2.5-Mistral-7B
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
is_mistral_derived_model: true

datasets:
  - path: rxavier/economicus
    type: sharegpt
    conversation: chatml
    #shards: 100
dataset_prepared_path: taurus/train/last_run_prepared
val_set_size: 0.05

sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true

adapter: qlora
lora_r: 64
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj
  - gate_proj
  - down_proj
  - up_proj
#lora_modules_to_save:
#  - embed_tokens
#  - lm_head

load_in_8bit: false
load_in_4bit: true
strict: false

bf16: true
fp16: false
tf32: false

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 2
optimizer: paged_adamw_8bit
weight_decay: 0.0
lr_scheduler: cosine
learning_rate: 0.0003
warmup_ratio: 0.03
adam_beta2: 0.95
adam_epsilon: 0.00001
max_grad_norm: 1.0
gradient_checkpointing: true

early_stopping_patience:
evals_per_epoch: 4
eval_table_size:
eval_table_max_new_tokens:
eval_sample_packing: false
saves_per_epoch: 4
debug:
deepspeed:
fsdp:
fsdp_config:

train_on_inputs: false
group_by_length: false
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

output_dir: taurus/train/output
save_safetensors: true

wandb_project: economicus
wandb_entity:
wandb_watch:
wandb_run_id:
wandb_log_model:

#special_tokens:
#  eos_token: "<|im_end|>"
#tokens:
#  - "<|im_start|>"
#  - "<|im_end|>"
chat_template: chatml
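
# Usage note (assumption, not part of the original config): with Axolotl and
# accelerate installed, a config like this is typically launched with the
# standard Axolotl CLI. The filename "taurus.yml" below is illustrative.
#
#   accelerate launch -m axolotl.cli.train taurus.yml
#
# Since dataset_prepared_path is set, the dataset can also be pre-tokenized
# ahead of time (optional) with:
#
#   python -m axolotl.cli.preprocess taurus.yml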