wp931120x committed
Commit 3ff0b2b
1 Parent(s): 97da9c7

Upload sft_lora.py


Supervised finetuning by LoRA

Files changed (1)
  1. sft_lora.py +167 -0
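
The script takes no command-line arguments; with the packages it imports (torch, transformers, peft, bitsandbytes, datasets) installed and the hard-coded ./baichuan-7B checkpoint and ./dataset/Belle_open_source_0.5M.json file in place, it is launched simply as:

python sft_lora.py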
sft_lora.py ADDED
@@ -0,0 +1,167 @@
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from datasets import load_dataset
import transformers
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
import torch


CUTOFF_LEN = 1024  # 1024 accounts for about 96% of the data
VAL_SET_SIZE = 2000
DATA_PATH = "./dataset/Belle_open_source_0.5M.json"  # choose dataset
OUTPUT_DIR = "baichuansft"
resume_from_checkpoint = "baichuansft"

device_map = {"": 0}
tokenizer = AutoTokenizer.from_pretrained("./baichuan-7B", trust_remote_code=True)
# Load the base model with 4-bit NF4 quantization (QLoRA-style)
model = AutoModelForCausalLM.from_pretrained(
    "./baichuan-7B",
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
    ),
    device_map=device_map,
)

# Prepare the quantized model for training (e.g. cast norm layers, enable input grads)
model = prepare_model_for_kbit_training(model)

### Attach LoRA adapters to all linear layers
import bitsandbytes as bnb

def find_all_linear_names(model):
    # cls = bnb.nn.Linear8bitLt
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)


config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=modules,
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
tokenizer.pad_token_id = 0  # set a pad token id so batches can be padded

if resume_from_checkpoint:
    # Check the available weights and load them
    checkpoint_name = os.path.join(
        resume_from_checkpoint, "pytorch_model.bin"
    )  # Full checkpoint
    if not os.path.exists(checkpoint_name):
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "adapter_model.bin"
        )  # only LoRA model - LoRA config above has to fit
        resume_from_checkpoint = (
            False  # So the trainer won't try loading its state
        )
    # The two files above have a different name depending on how they were saved, but are actually the same.
    if os.path.exists(checkpoint_name):
        print(f"Restarting from {checkpoint_name}")
        adapters_weights = torch.load(checkpoint_name)
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {checkpoint_name} not found")

data = load_dataset("json", data_files=DATA_PATH)

def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    # Append eos when the sequence was not truncated and does not already end with it
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    # If the sequence was truncated to CUTOFF_LEN, force the last token to be eos
    if add_eos_token and len(result["input_ids"]) >= CUTOFF_LEN:
        result["input_ids"][CUTOFF_LEN - 1] = tokenizer.eos_token_id
        result["attention_mask"][CUTOFF_LEN - 1] = 1

    result["labels"] = result["input_ids"].copy()

    return result


def generate_and_tokenize_prompt(data_point):
    instruction = data_point['instruction']
    input_text = data_point["input"]
    input_text = "Human: " + instruction + input_text + "\n\nAssistant: "
    input_text = tokenizer.bos_token + input_text if tokenizer.bos_token is not None else input_text
    target_text = data_point["output"] + tokenizer.eos_token
    full_prompt = input_text + target_text
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

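
# For reference: each JSON record is expected to provide "instruction", "input"
# and "output" fields (the keys read above), which are flattened into a single
# training string of the form
#   <bos>Human: {instruction}{input}\n\nAssistant: {output}<eos>
# Labels are a straight copy of the input ids, so the loss covers the prompt as
# well as the response (no prompt masking).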

if VAL_SET_SIZE > 0:
    train_val = data["train"].train_test_split(
        test_size=VAL_SET_SIZE, shuffle=True, seed=42
    )
    train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)
else:
    train_data = data['train'].shuffle().map(generate_and_tokenize_prompt)
    val_data = None

trainer = Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=TrainingArguments(
        num_train_epochs=1,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        learning_rate=3e-4,
        gradient_accumulation_steps=4,  # effective batch size of 4 on the single GPU
        evaluation_strategy="steps" if VAL_SET_SIZE > 0 else "no",
        save_strategy="steps",
        eval_steps=2000 if VAL_SET_SIZE > 0 else None,
        save_steps=2000,
        output_dir=OUTPUT_DIR,
        report_to="tensorboard",
        save_total_limit=3,
        load_best_model_at_end=True if VAL_SET_SIZE > 0 else False,
        optim="adamw_torch",
    ),
    # Pads input_ids, attention_mask and labels within each batch
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)


trainer.train(resume_from_checkpoint=False)  # adapter weights, if any, were already loaded above
model.save_pretrained(OUTPUT_DIR)  # saves only the LoRA adapter weights and config
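
Once training finishes, the saved adapter can be attached back onto the base model for a quick generation check. A minimal sketch, assuming the same local ./baichuan-7B checkpoint and the baichuansft adapter directory written above (the prompt template mirrors the one used for training; the prompt text is just an example):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

tokenizer = AutoTokenizer.from_pretrained("./baichuan-7B", trust_remote_code=True)
base = AutoModelForCausalLM.from_pretrained(
    "./baichuan-7B",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # could also be reloaded in 4-bit with the same BitsAndBytesConfig
    device_map={"": 0},
)
model = PeftModel.from_pretrained(base, "baichuansft")  # attach the trained LoRA adapter
model.eval()
# model = model.merge_and_unload()  # optionally fold the adapter into the base weights

prompt = "Human: Give three tips for staying healthy.\n\nAssistant: "
if tokenizer.bos_token is not None:
    prompt = tokenizer.bos_token + prompt  # mirror the training-time prompt construction
inputs = tokenizer(prompt, return_tensors="pt").to(base.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=256, do_sample=True, top_p=0.9, temperature=0.7)
print(tokenizer.decode(output[0], skip_special_tokens=True))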