train:
  seq_length: 64                   # Size of LM context
  epochs: 100                      # Train for max(epochs, total_steps)
  total_steps: 1000                # Train for max(epochs, total_steps)
  batch_size: 16                   # Batch size
  checkpoint_interval: 10000       # Checkpoint interval
  eval_interval: 128               # Eval interval
  pipeline: "PromptPipeline"       # Name of prompt pipeline to load
  trainer: "AcceleratePPOTrainer"  # Name of model trainer to load

model:
  model_path: "lvwerra/gpt2-imdb"  # Name of HF model to load
  num_layers_unfrozen: 2           # Number of topmost layers to leave unfrozen (trainable) during training

tokenizer:
  tokenizer_path: "gpt2"           # Name of HF tokenizer to load
  truncation_side: "right"         # Trim this side of samples if they are longer than the LM context

optimizer:
  name: "adamw"                    # Name of optimizer to load
  kwargs:
    lr: 1.412e-4                   # Learning rate
    betas: [0.9, 0.95]             # Adam betas
    eps: 1.0e-8                    # Adam epsilon
    weight_decay: 1.0e-6           # Weight decay

scheduler:
  name: "cosine_annealing"         # Name of learning rate scheduler
  kwargs:
    T_max: 10000                   # Maximum number of steps
    eta_min: 1.412e-4              # Minimum learning rate

method:
  name: "ppoconfig"                # Name of RL method config
  num_rollouts: 128                # Number of rollouts to collect per epoch
  chunk_size: 128                  # Number of rollouts to collect in one loop
  ppo_epochs: 4                    # Number of PPO epochs per batch of rollouts
  init_kl_coef: 0.2                # Initial KL coefficient
  target: 6                        # Target KL for adaptive KL control; set to None for a fixed KL coefficient
  horizon: 10000                   # PPO horizon
  gamma: 0.99                      # PPO discount factor
  lam: 0.95                        # PPO GAE lambda
  cliprange: 0.2                   # Policy clip range
  cliprange_value: 0.2             # Value function clip range
  vf_coef: 1.0                     # Value term weight
  scale_reward: "running"          # False | "ref" | "running": estimate against which to scale rewards
  cliprange_reward: 10             # Clip rewards to [-cliprange_reward, cliprange_reward]
  ref_mean: null                   # Reference mean for reward scaling when scale_reward is "ref"
  ref_std: null                    # Reference std for reward scaling when scale_reward is "ref"
  gen_kwargs:
    max_length: 48                 # LM max sample generation length
    min_length: 48                 # LM min sample generation length
    top_k: 0                       # Top-k sampling (0 disables)
    top_p: 1.0                     # Top-p (nucleus) sampling
    do_sample: True                # Sample from the LM
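
# A minimal usage sketch (kept as comments so this file stays valid YAML):
# loading this config with trlX's Python API and launching PPO training.
# This is an assumption about how the config is consumed, not part of the
# config itself; the file name "ppo_config.yml", the toy reward_fn, and the
# prompt list below are hypothetical placeholders.
#
#   import trlx
#   from trlx.data.configs import TRLConfig
#
#   config = TRLConfig.load_yaml("ppo_config.yml")
#
#   # Toy reward: favor longer samples. Replace with a real reward model,
#   # e.g. a sentiment classifier scoring IMDB continuations.
#   def reward_fn(samples, **kwargs):
#       return [float(len(s)) for s in samples]
#
#   trainer = trlx.train(
#       reward_fn=reward_fn,
#       prompts=["The movie was"] * 128,  # hypothetical prompt set
#       config=config,
#   )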