DDDano333 committed
Commit 07e2f18
Parent: 22c6032

switched to accelerator
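In outline, the commit drops the hand-rolled torch.distributed bootstrap (setup/cleanup, DistributedDataParallel, mp.spawn) and lets Hugging Face Accelerate handle device placement and process management. A minimal sketch of that pattern, with a stand-in nn.Linear instead of the real LLaMA model and a trimmed-down Accelerator call (both are assumptions for illustration, not the exact code committed):

import torch.nn as nn
from accelerate import Accelerator

def train():
    # One Accelerator object replaces init_process_group/destroy_process_group
    # and the explicit DistributedDataParallel wrapping from the old script.
    accelerator = Accelerator(mixed_precision="fp16")
    device = accelerator.device          # instead of torch.device(f"cuda:{rank}")

    model = nn.Linear(8, 8)              # stand-in for LLaMAForCausalLM
    model = accelerator.prepare(model)   # instead of DistributedDataParallel(model, ...)
    return model, device

if __name__ == "__main__":
    train()

The same accelerator.prepare call can also take the optimizer and dataloaders, which is why the training loop no longer needs any per-rank device juggling.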

Files changed (1)
  1. train.py +38 -45
train.py CHANGED
@@ -1,37 +1,29 @@
 import os
 import torch
 import torch.nn as nn
-import torch.distributed as dist
-import torch.multiprocessing as mp
 import bitsandbytes as bnb
 from datasets import load_dataset
 import transformers
 from transformers import AutoTokenizer, AutoConfig, LLaMAForCausalLM, LLaMATokenizer
 from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model
 
-def setup(rank, world_size):
-    os.environ['MASTER_ADDR'] = 'localhost'
-    os.environ['MASTER_PORT'] = '12355'
-
-    # Initialize the process group
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
-
-def cleanup():
-    dist.destroy_process_group()
-
-def train(rank, world_size):
-    setup(rank, world_size)
-
-    # os.system("nvidia-smi")
-    # os.system("git clone https://github.com/tloen/alpaca-lora.git")
-    # os.chdir("alpaca-lora/")
-    # os.system("pip install -q datasets loralib sentencepiece")
-    # os.system("pip uninstall -y transformers")
-    # os.system("pip install -q git+https://github.com/zphang/transformers@c3dc391")
-    # os.system("pip install -q git+https://github.com/huggingface/peft.git")
-    # os.system("pip install bitsandbytes")
-
-    # os.system("conda install -y -c conda-forge cudatoolkit")
+# Import the necessary Accelerate modules
+from accelerate import Accelerator, DistributedType
+
+def train():
+    # Initialize the Accelerator
+    accelerator = Accelerator(
+        device_placement=True,
+        split_batches=False,
+        mixed_precision="fp16",
+        # distributed_type=DistributedType.MULTI_GPU,
+        gradient_accumulation_steps=1,
+        rng_types=["torch", "cuda"],
+        log_with=["tensorboard", "wandb", "comet_ml"],
+        project_dir="./",
+        even_batches=True,
+        step_scheduler_with_optimizer=True
+    )
 
     MICRO_BATCH_SIZE = 1
     BATCH_SIZE = 16
@@ -42,18 +34,20 @@ def train(rank, world_size):
     LORA_ALPHA = 8
     LORA_DROPOUT = 0.05
 
-    device = torch.device(f"cuda:{rank}")
+    device = accelerator.device
+
     model = LLaMAForCausalLM.from_pretrained(
         "decapoda-research/llama-7b-hf",
         load_in_8bit=True,
         device_map="auto",
     )
-    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank], output_device=rank)
+
+    model = accelerator.prepare(model)
     tokenizer = LLaMATokenizer.from_pretrained(
         "decapoda-research/llama-7b-hf", add_eos_token=True
     )
 
-    model = prepare_model_for_int8_training(model.module)
+    model = prepare_model_for_int8_training(model)
 
     config = LoraConfig(
         r=LORA_R,
@@ -63,7 +57,7 @@ def train(rank, world_size):
         bias="none",
         task_type="CAUSAL_LM",
     )
-    model.module = get_peft_model(model.module, config)
+    model = get_peft_model(model, config)
     tokenizer.pad_token_id = 0
     data = load_dataset("json", data_files="../samples.json")
 
@@ -89,29 +83,28 @@ def train(rank, world_size):
         )
     )
 
+    training_args = transformers.TrainingArguments(
+        per_device_train_batch_size=MICRO_BATCH_SIZE,
+        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
+        warmup_steps=100,
+        num_train_epochs=EPOCHS,
+        learning_rate=LEARNING_RATE,
+        logging_steps=1,
+        output_dir=f"lora-smartscraper-{accelerator.process_index}",
+        save_total_limit=3,
+    )
+    # training_args = accelerator.update_arguments(training_args)
+
     trainer = transformers.Trainer(
         model=model,
         train_dataset=data["train"],
-        args=transformers.TrainingArguments(
-            per_device_train_batch_size=MICRO_BATCH_SIZE,
-            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
-            warmup_steps=100,
-            num_train_epochs=EPOCHS,
-            learning_rate=LEARNING_RATE,
-            fp16=True,
-            logging_steps=1,
-            output_dir=f"lora-smartscraper-{rank}",
-            save_total_limit=3,
-        ),
+        args=training_args,
         data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
     )
     model.config.use_cache = False
     trainer.train(resume_from_checkpoint=False)
 
-    model.save_pretrained(f"lora-smartscraper-{rank}")
-
-    cleanup()
+    model.save_pretrained(f"lora-smartscraper-{accelerator.process_index}")
 
 if __name__ == "__main__":
-    world_size = torch.cuda.device_count()
-    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)
+    train()
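With mp.spawn gone, train.py no longer spawns its own worker processes, so under the usual Accelerate workflow the script would be started externally (run accelerate config once, then accelerate launch train.py); the launcher supplies the process group that MASTER_ADDR/MASTER_PORT used to configure. A purely hypothetical programmatic alternative, not part of this commit, is accelerate.notebook_launcher:

# Hypothetical launcher sketch (not in this commit): run train() across several
# processes without mp.spawn, letting Accelerate own the process group.
from accelerate import notebook_launcher
from train import train  # assumes train.py is importable and exposes train()

if __name__ == "__main__":
    # num_processes is an assumption; set it to the number of available GPUs.
    notebook_launcher(train, args=(), num_processes=2)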