winglian committed
Commit f620706 • 2 Parent(s): e944311 77762a5

Merge pull request #189 from OpenAccess-AI-Collective/fixes-20230711

configs/accelerate/default_config.yaml DELETED
@@ -1,15 +0,0 @@
- compute_environment: LOCAL_MACHINE
- distributed_type: 'NO'
- downcast_bf16: 'no'
- gpu_ids: all
- machine_rank: 0
- main_training_function: main
- mixed_precision: bf16
- num_machines: 1
- num_processes: 1
- rdzv_backend: static
- same_network: true
- tpu_env: []
- tpu_use_cluster: false
- tpu_use_sudo: false
- use_cpu: false

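The deleted accelerate default described a plain single-machine, single-process bf16 run with no TPU or CPU offload. For reference, a minimal sketch of roughly the same setup expressed in code (illustrative only, not part of this commit; the values are taken from the deleted YAML):

```python
# Illustrative sketch mirroring the deleted default_config.yaml:
# LOCAL_MACHINE, 1 machine, 1 process, mixed_precision: bf16, no TPU/CPU offload.
from accelerate import Accelerator

accelerator = Accelerator(mixed_precision="bf16")
print(accelerator.num_processes, accelerator.mixed_precision)  # expect: 1 bf16
```
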
configs/llama_13B_alpaca.yml DELETED
@@ -1,39 +0,0 @@
- base_model: huggyllama/llama-13b
- model_type: LlamaForCausalLM
- tokenizer_type: LlamaTokenizer
- load_in_8bit: true
- datasets:
-   - path: anon8231489123/ShareGPT_Vicuna_unfiltered
-     data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
-     type: sharegpt
- dataset_prepared_path: last_run_prepared
- val_set_size: 0.002
- adapter:
- lora_model_dir:
- sequence_len: 2048
- lora_r: 8
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
-   - q_proj
-   - v_proj
- lora_fan_in_fan_out: false
- wandb_project:
- wandb_watch:
- wandb_run_id:
- wandb_log_model:
- output_dir: ./llama-13b-sharegpt
- gradient_accumulation_steps: 1
- micro_batch_size: 2
- warmup_steps: 1000
- save_steps:
- eval_steps:
- num_epochs: 5
- learning_rate: 0.00003
- train_on_inputs: false
- group_by_length: false
- bf16: true
- tf32: true
- early_stopping_patience: 5
- resume_from_checkpoint:
- local_rank:

configs/llama_65B_alpaca.yml DELETED
@@ -1,44 +0,0 @@
- base_model: huggyllama/llama-65b
- model_type: LlamaForCausalLM
- tokenizer_type: LlamaTokenizer
- load_in_8bit: true
- datasets:
-   - path: data/alpaca_data_gpt4.jsonl
-     type: alpaca
-   - path: anon8231489123/ShareGPT_Vicuna_unfiltered
-     data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
-     type: sharegpt
-   - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-     type: gpteacher
-   - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-     type: gpteacher
- dataset_prepared_path: last_run_prepared
- val_set_size: 0.04
- adapter: lora
- lora_model_dir:
- sequence_len: 2048
- lora_r: 8
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
-   - q_proj
-   - v_proj
- lora_fan_in_fan_out: false
- wandb_project: llama-65b-lora
- wandb_watch:
- wandb_run_id:
- wandb_log_model:
- output_dir: ./lora-llama-alpaca
- gradient_accumulation_steps: 1
- micro_batch_size: 16
- warmup_steps: 1000
- save_steps:
- num_epochs: 5
- learning_rate: 0.00003
- train_on_inputs: false
- group_by_length: false
- bf16: true
- tf32: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:

configs/llama_7B_4bit.yml DELETED
@@ -1,45 +0,0 @@
- base_model: decapoda-research/llama-7b-hf-int4
- base_model_config: decapoda-research/llama-7b-hf
- model_type: LlamaForCausalLM
- tokenizer_type: LlamaTokenizer
- load_in_8bit: true
- datasets:
-   - path: tatsu-lab/alpaca # original alpaca dataset
-     type: alpaca
- dataset_prepared_path: data/last_run_prepared
- val_set_size: 0.04
- adapter: lora
- lora_model_dir:
- sequence_len: 2048
- max_packed_sequence_len: 1024
- lora_r: 8
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
-   - q_proj
-   - v_proj
-   # - k_proj
-   # - o_proj
- lora_fan_in_fan_out: false
- wandb_project:
- wandb_watch:
- wandb_run_id:
- wandb_log_model:
- output_dir: ./lora-test
- gradient_accumulation_steps: 1
- micro_batch_size: 2
- num_epochs: 3
- warmup_steps: 100
- learning_rate: 0.00003
- train_on_inputs: false
- group_by_length: false
- bf16: true
- tf32: true
- gradient_checkpointing: false
- early_stopping_patience: 3
- resume_from_checkpoint:
- auto_resume_from_checkpoints: true
- local_rank:
- load_4bit: true
- xformers_attention: true
- flash_attention:

configs/quickstart.yml DELETED
@@ -1,45 +0,0 @@
- base_model: decapoda-research/llama-7b-hf-int4
- base_model_config: decapoda-research/llama-7b-hf
- model_type: LlamaForCausalLM
- tokenizer_type: LlamaTokenizer
- load_in_8bit: true
- datasets:
-   - path: tatsu-lab/alpaca # original alpaca dataset
-     type: alpaca
- dataset_prepared_path: data/last_run_prepared
- val_set_size: 0.04
- adapter: lora
- lora_model_dir:
- sequence_len: 1024
- max_packed_sequence_len: 1024
- lora_r: 8
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
-   - q_proj
-   - v_proj
-   # - k_proj
-   # - o_proj
- lora_fan_in_fan_out: false
- wandb_project:
- wandb_watch:
- wandb_run_id:
- wandb_log_model:
- output_dir: ./lora-test
- gradient_accumulation_steps: 1
- micro_batch_size: 1
- num_epochs: 3
- warmup_steps: 100
- learning_rate: 0.00003
- train_on_inputs: false
- group_by_length: false
- bf16: true
- tf32: true
- gradient_checkpointing: false
- early_stopping_patience: 3
- resume_from_checkpoint:
- auto_resume_from_checkpoints: true
- local_rank:
- gptq: true
- xformers_attention: true
- flash_attention:

configs/vicuna_13B_4bit_reflect.yml DELETED
@@ -1,45 +0,0 @@
- base_model: anon8231489123/vicuna-13b-GPTQ-4bit-128g
- base_model_config: anon8231489123/vicuna-13b-GPTQ-4bit-128g
- model_type: LlamaForCausalLM
- tokenizer_type: LlamaTokenizer
- load_in_8bit: false
- load_4bit: true
- gptq_groupsize: 128
- gptq_model_v1: false
- datasets:
-   # https://github.com/vaguenebula/AlpacaDataReflect/blob/main/alpaca_reflect_pruned.json
-   - path: data/alpaca_reflect_pruned.jsonl
-     type: reflection
- dataset_prepared_path: data/last_run_prepared
- val_set_size: 0.04
- adapter: lora
- lora_model_dir:
- sequence_len: 2048
- max_packed_sequence_len: 2048
- lora_r: 8
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
-   - q_proj
-   - v_proj
-   # - k_proj
-   # - o_proj
- lora_fan_in_fan_out: false
- wandb_project:
- wandb_watch:
- wandb_run_id:
- wandb_log_model:
- output_dir: ./lora-reflect
- gradient_accumulation_steps: 1
- micro_batch_size: 2
- num_epochs: 3
- learning_rate: 0.00003
- train_on_inputs: false
- group_by_length: false
- bf16: true
- tf32: true
- gradient_checkpointing: false
- early_stopping_patience: 3
- resume_from_checkpoint:
- local_rank:
- flash_attention: true

configs/pythia_1_2B_alpaca.yml β†’ examples/pythia/lora.yml RENAMED
@@ -1,36 +1,29 @@
  base_model: EleutherAI/pythia-1.4b-deduped
- model_type: GPTNeoXForCausalLM
- tokenizer_type: AutoTokenizer
+ base_model_config: EleutherAI/pythia-1.4b-deduped
  load_in_8bit: true
  datasets:
-   - path: data/alpaca_data_gpt4.jsonl
+   - path: teknium/GPT4-LLM-Cleaned
      type: alpaca
-   - path: data/vicuna_cleaned.jsonl
-     type: sharegpt
-   - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-     type: gpteacher
-   - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-     type: gpteacher
  dataset_prepared_path: last_run_prepared
  val_set_size: 0.05
  adapter: lora
  lora_model_dir:
- sequence_len: 2048
- lora_r: 8
+ sequence_len: 512
+ lora_r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  lora_target_modules:
    - query_key_value
-   # - xxx
+ lora_target_linear:
  lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
- wandb_project: pythia-1.4b-lora
+ wandb_project:
  wandb_watch:
  wandb_run_id:
  wandb_log_model:
- output_dir: ./lora-alpaca
+ output_dir: ./lora-alpaca-pythia
  gradient_accumulation_steps: 1
  micro_batch_size: 4
- num_epochs: 5
+ num_epochs: 3
  learning_rate: 0.00001
  train_on_inputs: false
  group_by_length: false
@@ -39,3 +32,6 @@ tf32: True
  early_stopping_patience:
  resume_from_checkpoint:
  local_rank:
+ weight_decay: 0.1
+ eval_steps: 20
+ logging_steps: 1

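The renamed example now points at a single hosted dataset and raises the LoRA rank while trimming sequence length and epochs. A small sketch for sanity-checking the resulting examples/pythia/lora.yml (hypothetical snippet, not part of the commit; assumes PyYAML is installed and the script is run from the repo root):

```python
# Hypothetical inspection snippet, not part of the repo.
import yaml

with open("examples/pythia/lora.yml") as f:
    cfg = yaml.safe_load(f)

# Values introduced or changed in this commit, per the diff above.
print(cfg["base_model_config"])            # EleutherAI/pythia-1.4b-deduped
print(cfg["sequence_len"], cfg["lora_r"])  # 512 16
print(cfg["num_epochs"])                   # 3
print(cfg["datasets"][0]["path"])          # teknium/GPT4-LLM-Cleaned
```
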
src/axolotl/utils/models.py CHANGED
@@ -305,7 +305,9 @@ def load_model(
          or (cfg.adapter == "qlora" and cfg.load_in_4bit)
      ):
          logging.info("converting PEFT model w/ prepare_model_for_kbit_training")
-         model = prepare_model_for_kbit_training(model)
+         model = prepare_model_for_kbit_training(
+             model, use_gradient_checkpointing=cfg.gradient_checkpointing
+         )

      model, lora_config = load_adapter(model, cfg, adapter)

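The models.py change threads cfg.gradient_checkpointing into PEFT's prepare_model_for_kbit_training instead of relying on the helper's default, which enables gradient checkpointing. A minimal standalone sketch of that call, assuming an 8-bit bitsandbytes load; the model name and flag value are placeholders, not axolotl's code:

```python
# Standalone sketch of the patched call; placeholder model and settings,
# not axolotl's load_model.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/pythia-1.4b-deduped",  # placeholder base model
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

gradient_checkpointing = False  # stands in for cfg.gradient_checkpointing
model = prepare_model_for_kbit_training(
    model, use_gradient_checkpointing=gradient_checkpointing
)
```
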
src/axolotl/utils/validation.py CHANGED
@@ -57,6 +57,11 @@ def validate_config(cfg):
      if (cfg.base_model and "falcon" in cfg.base_model.lower()) and cfg.fsdp:
          raise ValueError("FSDP is not supported for falcon models")

+     if (
+         cfg.base_model and "mpt" in cfg.base_model.lower()
+     ) and cfg.gradient_checkpointing:
+         raise ValueError("gradient_checkpointing is not supported for MPT models")
+
      # TODO
      # MPT 7b
      # https://github.com/facebookresearch/bitsandbytes/issues/25
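In isolation, the new validation guard amounts to the small standalone sketch below (a hypothetical helper mirroring the added check, not the project's validate_config): any config whose base_model contains "mpt" while gradient_checkpointing is enabled is rejected.

```python
# Standalone sketch mirroring the guard added above; not the project's code.
def check_mpt_gradient_checkpointing(cfg: dict) -> None:
    base_model = cfg.get("base_model") or ""
    if "mpt" in base_model.lower() and cfg.get("gradient_checkpointing"):
        raise ValueError("gradient_checkpointing is not supported for MPT models")

try:
    check_mpt_gradient_checkpointing(
        {"base_model": "mosaicml/mpt-7b", "gradient_checkpointing": True}
    )
except ValueError as err:
    print(err)  # gradient_checkpointing is not supported for MPT models
```
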
tests/test_validation.py CHANGED
@@ -198,3 +198,17 @@ class ValidationTest(unittest.TestCase):
          )

          validate_config(cfg)
+
+     def test_mpt_gradient_checkpointing(self):
+         regex_exp = r".*gradient_checkpointing is not supported for MPT models*"
+
+         # Check for lower-case
+         cfg = DictDefault(
+             {
+                 "base_model": "mosaicml/mpt-7b",
+                 "gradient_checkpointing": True,
+             }
+         )
+
+         with pytest.raises(ValueError, match=regex_exp):
+             validate_config(cfg)