---
# Fine-tuning run configuration: LoRA is enabled and model_id_or_path points
# at a local 7B-v0.3 model directory.
#
# NOTE(review): this file previously had a stray " |" at the end of every
# line and no indentation, which left duplicate top-level keys (`data`,
# `shuffle`). The nesting below was reconstructed from the alphabetical key
# order and is assumed to match the consumer's schema (looks like the
# mistral-finetune training-args layout) -- confirm against the loader.

batch_size: 1
checkpoint: true
ckpt_freq: 100

data:
  # Path to the training text corpus.
  data: /content/data/HansardSequences_250k.big.txt
  eval_instruct_data: ''
  instruct:
    dynamic_chunk_fn_call: true
    # Distinct from data.shuffle below; this one sits under `instruct`.
    shuffle: true
  instruct_data: ''
  shuffle: false

# NOTE(review): no_eval is true further down, so eval_freq is presumably
# unused for this run -- verify.
eval_freq: 100
log_freq: 1

lora:
  dropout: 0.0
  enable: true
  rank: 64
  scaling: 2.0

max_norm: 1.0
max_steps: 100

mlflow:
  experiment_name: null
  tracking_uri: null

model_id_or_path: /content/mistral_models/7B-v0.3
no_ckpt: false
no_eval: true
num_ckpt_keep: 3
num_microbatches: 8

optim:
  lr: 0.0001
  pct_start: 0.05
  weight_decay: 0.1

run_dir: /content/debategpt
save_adapters: true
seed: 0
seq_len: 8192

wandb:
  key: null
  offline: false
  project: null
  run_name: null

world_size: 1