# chatlawv1/trlx/configs/test_config.yml
# Uploaded by teachyourselfcoding ("Upload 245 files", commit fa6856c), 1.92 kB.
# Training-loop settings.
train:
  seq_length: 64  # Size of LM context
  # NOTE(review): both limits below are documented as max(epochs, total_steps);
  # confirm against the trainer whether the run stops at the first or the last
  # of the two limits to be reached.
  epochs: 100  # Train for max(epochs, total_steps)
  total_steps: 1000  # Train for max(epochs, total_steps)
  batch_size: 16  # batch size

  # checkpoint_interval (10000) is larger than total_steps (1000), so
  # presumably no intermediate checkpoint is written in this run - confirm.
  checkpoint_interval: 10000  # checkpoint interval
  eval_interval: 128  # eval interval

  pipeline: "PromptPipeline"  # prompt pipeline to load
  trainer: "AcceleratePPOTrainer"  # Name of model trainer to load
# Model to fine-tune.
model:
  model_path: "lvwerra/gpt2-imdb"  # Name of hf model to load
  # Number of layers kept trainable (unfrozen), going by the key name;
  # presumably the remaining layers are frozen - confirm against the
  # model loader which end of the stack is counted.
  num_layers_unfrozen: 2
# Tokenizer settings.
tokenizer:
  tokenizer_path: "gpt2"  # Name of hf tokenizer to load
  # Trim this side of samples if they are longer than LM context
  truncation_side: "right"
# Optimizer selection; everything under `kwargs` belongs to this optimizer
# only (the scheduler section has its own `name`/`kwargs` pair).
optimizer:
  name: "adamw"  # Name of optimizer to load
  kwargs:
    lr: 1.412e-4  # Learning rate
    betas: [0.9, 0.95]  # Adam betas
    eps: 1.0e-8  # Adam eps
    weight_decay: 1.0e-6  # Weight decay param
# Learning-rate scheduler selection; `kwargs` here belongs to the scheduler.
scheduler:
  name: "cosine_annealing"  # Name of learning rate scheduler
  kwargs:
    T_max: 10000  # Maximum number of steps
    # Same value as optimizer.kwargs.lr (1.412e-4), so the learning rate
    # presumably stays constant over the schedule - confirm this is intended.
    eta_min: 1.412e-4  # Minimum learning rate
# RL method (PPO) hyperparameters.
method:
  name: "ppoconfig"  # Name of RL method config
  num_rollouts: 128  # Number of rollouts to collect per epoch
  chunk_size: 128  # Number of rollouts to collect in one loop
  ppo_epochs: 4  # Number of ppo epochs
  init_kl_coef: 0.2  # init kl coefficient
  # target kl coefficient; set to null (YAML's spelling of None) for a fixed
  # kl coef - a bare `None` would be read as the string "None".
  target: 6
  horizon: 10000  # PPO horizon
  gamma: 0.99  # PPO discount
  lam: 0.95  # PPO lambda
  cliprange: 0.2  # clip range
  cliprange_value: 0.2  # clip range for the value term (per the key name)
  vf_coef: 1.0  # value term weight
  # False|"ref"|"running" estimate against which to scale rewards
  scale_reward: "running"
  # Clip range applied to rewards (per the key name); presumably rewards are
  # limited to [-10, 10] - confirm against the trainer.
  cliprange_reward: 10
  # Left unset here; presumably only consulted when scale_reward is "ref" -
  # confirm.
  ref_mean: null
  ref_std: null
  # Sampling arguments used when generating rollouts.
  gen_kwargs:
    # min_length equals max_length, so generation length is pinned to 48.
    max_length: 48  # LM max sample gen length
    min_length: 48  # LM min sample gen length
    # NOTE(review): top-k is normally an integer count; 0.0 presumably
    # disables top-k filtering - confirm the consumer accepts a float here.
    top_k: 0.0  # top k
    top_p: 1.0  # top p
    do_sample: true  # sample