andreaskoepf commited on
Commit
5792c3e
1 Parent(s): 57ee494

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +111 -0
README.md CHANGED
@@ -1,3 +1,114 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ wandb run: https://wandb.ai/open-assistant/supervised-finetuning/runs/bxyaxo4v
6
  + Checkpoint: 2000 steps (~48% of the 1st epoch)
7
+
8
+
9
+ Datasets:
10
+
11
+ ```
12
+ pretrain:
13
+ use_custom_sampler: true
14
+ sort_by_length: false
15
+ datasets:
16
+ - joke
17
+ - webgpt:
18
+ val_split: 0.1
19
+ - gpt4all:
20
+ val_split: 0.01
21
+ - alpaca:
22
+ val_split: 0.025
23
+ - code_alpaca:
24
+ val_split: 0.05
25
+ - minimath
26
+ - humaneval_mbpp_codegen_qa
27
+ - humaneval_mbpp_testgen_qa
28
+ - grade_school_math_instructions
29
+ - recipes
30
+ - cmu_wiki_qa
31
+ #- youtube_subs_howto100m # uses incompatible column names
32
+ #- ubuntu_dialogue_qa # fails to load
33
+ - oa_wiki_qa_bart_10000row
34
+ - prosocial_dialogue:
35
+ fraction: 0.1
36
+ - explain_prosocial:
37
+ fraction: 0.05
38
+ ```
39
+
40
+
41
+ Pythia:
42
+ ```
43
+ pythia-12b:
44
+ dtype: fp16
45
+ log_dir: "pythia_log_12b"
46
+ learning_rate: 6e-6
47
+ model_name: EleutherAI/pythia-12b-deduped
48
+ output_dir: pythia_model_12b
49
+ weight_decay: 0.0
50
+ max_length: 2048
51
+ use_flash_attention: true
52
+ deepspeed_config: configs/zero_conf2.json
53
+ warmup_steps: 50
54
+ gradient_checkpointing: true
55
+ gradient_accumulation_steps: 2
56
+ per_device_train_batch_size: 8
57
+ per_device_eval_batch_size: 5
58
+ eval_steps: 200
59
+ save_steps: 500
60
+ num_train_epochs: 2
61
+ save_total_limit: 2
62
+
63
+ ```
64
+
65
+
66
+ zero_conf2.json:
67
+ ```
68
+ {
69
+ "fp16": {
70
+ "enabled": "auto",
71
+ "loss_scale": 0,
72
+ "loss_scale_window": 1000,
73
+ "initial_scale_power": 16,
74
+ "hysteresis": 2,
75
+ "min_loss_scale": 1
76
+ },
77
+ "bf16": {
78
+ "enabled": "auto"
79
+ },
80
+ "optimizer": {
81
+ "type": "AdamW",
82
+ "params": {
83
+ "lr": "auto",
84
+ "betas": "auto",
85
+ "eps": "auto",
86
+ "weight_decay": "auto"
87
+ }
88
+ },
89
+ "scheduler": {
90
+ "type": "WarmupDecayLR",
91
+ "params": {
92
+ "warmup_min_lr": "auto",
93
+ "warmup_max_lr": "auto",
94
+ "warmup_num_steps": "auto",
95
+ "total_num_steps": "auto"
96
+ }
97
+ },
98
+ "zero_optimization": {
99
+ "stage": 2,
100
+ "allgather_partitions": true,
101
+ "allgather_bucket_size": 1000000000.0,
102
+ "overlap_comm": false,
103
+ "reduce_scatter": true,
104
+ "reduce_bucket_size": 1000000000.0,
105
+ "contiguous_gradients": true
106
+ },
107
+ "gradient_accumulation_steps": "auto",
108
+ "gradient_clipping": "auto",
109
+ "steps_per_print": 2000,
110
+ "train_batch_size": "auto",
111
+ "train_micro_batch_size_per_gpu": "auto",
112
+ "wall_clock_breakdown": false
113
+ }
114
+ ```