{ "base_model_id": "deepseek-ai/deepseek-coder-1.3b-instruct", "quantitize": "fp16", "dataset": "Arithmetic_Hard", "data_collator": "DataCollatorForLanguageModeling", "peft_config": { "lora": { "r": 32, "lora_alpha": 64, "target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj" ], "bias": "none", "lora_dropout": 0.05, "task_type": "CAUSAL_LM" }, "lora_large": { "r": 128, "lora_alpha": 256, "target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj" ], "bias": "none", "lora_dropout": 0.05, "task_type": "CAUSAL_LM" }, "p_tuning": { "num_virtual_tokens": 16, "num_transformer_submodules": 1, "token_dim": 2048, "encoder_hidden_size": 2048, "task_type": "CAUSAL_LM" } }, "training_args": { "warmup_steps": 500, "per_device_train_batch_size": 4, "per_device_eval_batch_size": 4, "gradient_accumulation_steps": 1, "max_steps": 100000, "learning_rate": 0.0001, "optim": "paged_adamw_8bit", "logging_steps": 100, "save_strategy": "steps", "save_total_limit": 5, "save_steps": 2500, "evaluation_strategy": "steps", "eval_steps": 2500, "weight_decay": 0.01, "report_to": "wandb", "dataloader_num_workers": 4, "load_best_model_at_end": true, "fp16": false, "output_dir": "runs/deepseek-full-hard", "logging_dir": "runs/deepseek-full-hard/logs" }, "tokenizer": { "tokenize_config": { "truncation": true, "max_length": 512, "padding": "max_length" }, "prompt_template": "config/qa_template.txt" } }