train:
  seq_length: 64                   # Size of LM context
  epochs: 100                      # Train for max(epochs, total_steps)
  total_steps: 1000                # Train for max(epochs, total_steps)
  batch_size: 16                   # Batch size
  checkpoint_interval: 10000       # Checkpoint interval
  eval_interval: 128               # Eval interval
  pipeline: "PromptPipeline"       # Name of prompt pipeline to load
  trainer: "AcceleratePPOTrainer"  # Name of model trainer to load

model:
  model_path: "lvwerra/gpt2-imdb"  # Name of HF model to load
  num_layers_unfrozen: 2           # Number of topmost layers to leave unfrozen (trainable) during training

tokenizer:
  tokenizer_path: "gpt2"           # Name of HF tokenizer to load
  truncation_side: "right"         # Trim this side of samples if they are longer than the LM context

optimizer:
  name: "adamw"                    # Name of optimizer to load
  kwargs:
    lr: 1.412e-4                   # Learning rate
    betas: [0.9, 0.95]             # Adam betas
    eps: 1.0e-8                    # Adam epsilon
    weight_decay: 1.0e-6           # Weight decay

scheduler:
  name: "cosine_annealing"         # Name of learning rate scheduler
  kwargs:
    T_max: 10000                   # Maximum number of steps
    eta_min: 1.412e-4              # Minimum learning rate

method:
  name: "ppoconfig"                # Name of RL method config
  num_rollouts: 128                # Number of rollouts to collect per epoch
  chunk_size: 128                  # Number of rollouts to collect in one loop
  ppo_epochs: 4                    # Number of PPO epochs per batch of rollouts
  init_kl_coef: 0.2                # Initial KL coefficient
  target: 6                        # Target KL for adaptive KL control; set to None for a fixed KL coefficient
  horizon: 10000                   # PPO horizon
  gamma: 0.99                      # PPO discount factor
  lam: 0.95                        # PPO GAE lambda
  cliprange: 0.2                   # Policy clip range
  cliprange_value: 0.2             # Value function clip range
  vf_coef: 1.0                     # Value term weight
  scale_reward: "running"          # False | "ref" | "running": estimate against which to scale rewards
  cliprange_reward: 10             # Clip rewards to [-cliprange_reward, cliprange_reward]
  ref_mean: null                   # Reference mean for reward scaling when scale_reward is "ref"
  ref_std: null                    # Reference std for reward scaling when scale_reward is "ref"
  gen_kwargs:
    max_length: 48                 # LM max sample generation length
    min_length: 48                 # LM min sample generation length
    top_k: 0                       # Top-k sampling (0 disables)
    top_p: 1.0                     # Top-p (nucleus) sampling
    do_sample: True                # Sample from the LM
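
# A minimal usage sketch (kept as comments so this file stays valid YAML):
# loading this config with trlX's Python API and launching PPO training.
# This is an assumption about how the config is consumed, not part of the
# config itself; the file name "ppo_config.yml", the toy reward_fn, and the
# prompt list below are hypothetical placeholders.
#
#   import trlx
#   from trlx.data.configs import TRLConfig
#
#   config = TRLConfig.load_yaml("ppo_config.yml")
#
#   # Toy reward: favor longer samples. Replace with a real reward model,
#   # e.g. a sentiment classifier scoring IMDB continuations.
#   def reward_fn(samples, **kwargs):
#       return [float(len(s)) for s in samples]
#
#   trainer = trlx.train(
#       reward_fn=reward_fn,
#       prompts=["The movie was"] * 128,  # hypothetical prompt set
#       config=config,
#   )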