batch_size_training: '4'
batching_strategy: padding
checkpoint_type: StateDictType.SHARDED_STATE_DICT
context_length: '8192'
curriculum_learning: 'False'
curriculum_phases: '3'
dataset: '[''OpenCoderSFTStage2'']'
ddp_timeout: '36000'
debug: 'False'
decay_steps: None
dist_checkpoint_folder: fine-tuned
drop_last: 'True'
dynamic_batch_size: 'False'
enable_deepspeed: 'False'
enable_fsdp: 'True'
enable_memory_profiling: 'False'
enable_memory_trace: 'False'
enable_mixed_precision: 'True'
enable_tensorboard: 'True'
eta_min: 1e-05
eval_epoch: '1'
eval_in_memory: 'False'
eval_steps: '1000'
evaluation_strategy: steps
flop_counter: 'False'
flop_counter_start: '3'
fp16: 'False'
freeze_layers: 'False'
from_peft_checkpoint: ''
fsdp_activation_checkpointing: 'True'
fsdp_cpu_offload: 'False'
fsdp_cpu_ram_efficient_loading: 'False'
gamma: '0.85'
gradient_accumulation_steps: '8'
gradient_checkpointing: 'True'
gradient_checkpointing_kwargs: '{''use_reentrant'': False}'
gradient_clipping: 'False'
gradient_clipping_threshold: '1.0'
handle_long_sequences: 'True'
hf_hub_metrics_cache_dir: /shared/public/data/controlllm/metrics/
hsdp: 'True'
learning_rate: 5e-05
load_best_model_at_end: 'False'
logging_steps: '500'
long_sequence_threshold: '16384'
low_cpu_fsdp: 'False'
lr: '0.0001'
lr_scheduler_per_iter: 'True'
max_eval_step: '500'
max_grad_norm: '1.0'
max_step: '0'
max_tokens_per_batch: '-1'
max_train_step: '-1'
memory_per_token: '-1'
mixed_precision: 'True'
model_name: PATH/to/Model
no_cuda: 'False'
num_epochs: '3'
num_freeze_layers: '1'
num_train_epochs: '20'
num_unfrozen_layers: '8'
num_workers_dataloader: '0'
one_gpu: 'False'
optimizer: AdamW
overwrite_output_dir: 'False'
peft_method: lora
per_device_eval_batch_size: '1'
per_device_train_batch_size: '12'
precompute_batches: None
pure_bf16: 'False'
quantization: 'False'
replica_group_size: '1'
resume_checkpoint_folder: None
resume_from_latest: 'True'
run_validation: 'True'
save_epoch: '1'
save_metrics: 'False'
save_model: 'True'
save_optimizer: 'False'
save_steps: '1000'
seed: '42'
sharding_group_size: '8'
sharding_strategy: ShardingStrategy.HYBRID_SHARD
step_size: '1'
tokenizer_name: None
trainer: native
unfrozen_strategy: interweave
use_fast_kernels: 'False'
use_fp16: 'False'
use_peft: 'False'
use_profiler: 'False'
use_wandb: 'False'
val_batch_size: '1'
warmup_steps: '1000'
weight_decay: '0.01'
weight_decay_ratio: '0.1'
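
Note that most values in this dump are serialized as quoted strings ('True', '4', even the stringified dataset list and gradient_checkpointing_kwargs dict), so anything consuming it has to coerce them back to native Python types. The following is a minimal sketch of such a loader, assuming PyYAML is available; the file name train_config.yaml and the coercion rules are illustrative assumptions, not part of the original configuration.

import ast
import yaml  # PyYAML is assumed to be installed


def _coerce(value):
    # Best-effort conversion of stringified values back to native types.
    if not isinstance(value, str):
        return value
    if value == "None":
        return None
    if value in ("True", "False"):
        return value == "True"
    try:
        # Handles ints, floats, and stringified containers such as
        # "['OpenCoderSFTStage2']" or "{'use_reentrant': False}".
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return value  # plain strings (paths, enum names, strategies) stay as-is


def load_train_config(path="train_config.yaml"):  # hypothetical file name
    with open(path) as fh:
        raw = yaml.safe_load(fh)
    return {key: _coerce(val) for key, val in raw.items()}


if __name__ == "__main__":
    cfg = load_train_config()
    print(cfg["context_length"], cfg["enable_fsdp"], cfg["dataset"])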