{
    "base_model_id": "deepseek-ai/deepseek-coder-1.3b-instruct",
    "quantitize": "fp16",
    "dataset": "Arithmetic_Hard",
    "data_collator": "DataCollatorForLanguageModeling",
    "peft_config": {
        "lora": {
            "r": 32,
            "lora_alpha": 64,
            "target_modules": [
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj"
            ],
            "bias": "none",
            "lora_dropout": 0.05,
            "task_type": "CAUSAL_LM"
        },
        "lora_large": {
            "r": 128,
            "lora_alpha": 256,
            "target_modules": [
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj"
            ],
            "bias": "none",
            "lora_dropout": 0.05,
            "task_type": "CAUSAL_LM"
        },
        "p_tuning": {
            "num_virtual_tokens": 16,
            "num_transformer_submodules": 1,
            "token_dim": 2048,
            "encoder_hidden_size": 2048,
            "task_type": "CAUSAL_LM"
        }
    },
    "training_args": {
        "warmup_steps": 500,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "gradient_accumulation_steps": 1,
        "max_steps": 100000,
        "learning_rate": 0.0001,
        "optim": "paged_adamw_8bit",
        "logging_steps": 100,
        "save_strategy": "steps",
        "save_total_limit": 5,
        "save_steps": 2500,
        "evaluation_strategy": "steps",
        "eval_steps": 2500,
        "weight_decay": 0.01,
        "report_to": "wandb",
        "dataloader_num_workers": 4,
        "load_best_model_at_end": true,
        "fp16": false,
        "output_dir": "runs/deepseek-full-hard",
        "logging_dir": "runs/deepseek-full-hard/logs"
    },
    "tokenizer": {
        "tokenize_config": {
            "truncation": true,
            "max_length": 512,
            "padding": "max_length"
        },
        "prompt_template": "config/qa_template.txt"
    }
}
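For reference, a minimal sketch of how a training script might consume this file, assuming it is saved as config/train_config.json (the path and loading code are illustrative, not taken from this repository). The "peft_config" entries unpack directly into peft's LoraConfig and PromptEncoderConfig, and "training_args" mirrors the fields of transformers.TrainingArguments:

import json

from peft import LoraConfig, PromptEncoderConfig
from transformers import TrainingArguments

# Hypothetical path; the actual filename is not specified by the config itself.
with open("config/train_config.json") as f:
    cfg = json.load(f)

# Choose one adapter variant from "peft_config"; "lora" and "lora_large"
# share the same schema, so either unpacks directly into LoraConfig.
lora_config = LoraConfig(**cfg["peft_config"]["lora"])

# "p_tuning" unpacks the same way into a prompt-encoder config.
p_tuning_config = PromptEncoderConfig(**cfg["peft_config"]["p_tuning"])

# "training_args" keys follow transformers.TrainingArguments. Note that
# "evaluation_strategy" is the older spelling; recent transformers
# releases rename it to "eval_strategy", so pin the library version
# accordingly.
training_args = TrainingArguments(**cfg["training_args"])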