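# QLoRA fine-tuning of baichuan-7B on the Belle 0.5M instruction dataset:
# load the base model in 4-bit NF4, attach LoRA adapters to every linear
# layer, and train with the Hugging Face Trainer.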
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from datasets import load_dataset
import transformers
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
import torch
CUTOFF_LEN = 1024 # 1024 accounts for about 96% of the data
VAL_SET_SIZE = 2000
DATA_PATH = "./dataset/Belle_open_source_0.5M.json" ## Choose dataset
OUTPUT_DIR = "baichuansft"
resume_from_checkpoint = "baichuansft"
device_map = {"": 0}
tokenizer = AutoTokenizer.from_pretrained("./baichuan-7B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "./baichuan-7B",
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
    ),
    device_map=device_map,
)
model = prepare_model_for_kbit_training(model)
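# prepare_model_for_kbit_training freezes the quantized base weights, upcasts
# norm layers for numerical stability, and enables gradient checkpointing.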
### Attach LoRA adapters to every linear layer
import bitsandbytes as bnb
def find_all_linear_names(model):
    # cls = bnb.nn.Linear8bitLt
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
modules = find_all_linear_names(model)
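# For baichuan-7B this typically resolves to the fused QKV projection
# (W_pack) plus o_proj, gate_proj, up_proj, and down_proj.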
config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=modules,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
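# Optional sanity check: model.print_trainable_parameters() confirms that
# only the LoRA adapters (a small fraction of the 7B weights) are trainable.
# baichuan's tokenizer defines no pad token, so token id 0 is reused for padding: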
tokenizer.pad_token_id = 0
if resume_from_checkpoint:
    # Check the available weights and load them
    checkpoint_name = os.path.join(
        resume_from_checkpoint, "pytorch_model.bin"
    )  # Full checkpoint
    if not os.path.exists(checkpoint_name):
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "adapter_model.bin"
        )  # only LoRA model - LoRA config above has to fit
        resume_from_checkpoint = (
            False  # So the trainer won't try loading its state
        )
    # The two files above have a different name depending on how they were saved, but are actually the same.
    if os.path.exists(checkpoint_name):
        print(f"Restarting from {checkpoint_name}")
        adapters_weights = torch.load(checkpoint_name)
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {checkpoint_name} not found")
data = load_dataset("json", data_files=DATA_PATH)
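# Tokenize a prompt up to CUTOFF_LEN tokens, appending EOS when the sequence
# is shorter than the limit and forcing EOS into the final slot otherwise.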
def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)
    if add_eos_token and len(result["input_ids"]) >= CUTOFF_LEN:
        result["input_ids"][CUTOFF_LEN - 1] = tokenizer.eos_token_id
        result["attention_mask"][CUTOFF_LEN - 1] = 1
    result["labels"] = result["input_ids"].copy()
    return result
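# Each Belle record has "instruction", "input", and "output" fields; they are
# joined into one "Human: ...\n\nAssistant: ..." sequence, and labels are a
# straight copy of input_ids (prompt tokens are not masked out of the loss).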
def generate_and_tokenize_prompt(data_point):
    instruction = data_point['instruction']
    input_text = data_point["input"]
    input_text = "Human: " + instruction + input_text + "\n\nAssistant: "
    input_text = tokenizer.bos_token + input_text if tokenizer.bos_token is not None else input_text
    target_text = data_point["output"] + tokenizer.eos_token
    full_prompt = input_text + target_text
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt
if VAL_SET_SIZE > 0:
    train_val = data["train"].train_test_split(
        test_size=VAL_SET_SIZE, shuffle=True, seed=42
    )
    train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)
else:
    train_data = data['train'].shuffle().map(generate_and_tokenize_prompt)
    val_data = None
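# Effective batch size = per_device_train_batch_size (1)
# x gradient_accumulation_steps (4) = 4 sequences per optimizer step.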
trainer = Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=TrainingArguments(
        num_train_epochs=1,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        learning_rate=3e-4,
        gradient_accumulation_steps=4,
        evaluation_strategy="steps" if VAL_SET_SIZE > 0 else "no",
        save_strategy="steps",
        eval_steps=2000 if VAL_SET_SIZE > 0 else None,
        save_steps=2000,
        output_dir=OUTPUT_DIR,
        report_to="tensorboard",
        save_total_limit=3,
        load_best_model_at_end=True if VAL_SET_SIZE > 0 else False,
        optim="adamw_torch",
    ),
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer,
        pad_to_multiple_of=8,
        return_tensors="pt",
        padding=True,
    ),
)
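# Adapter weights, if found, were already loaded via set_peft_model_state_dict
# above, so the Trainer starts at step 0 instead of resuming trainer state.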
trainer.train(resume_from_checkpoint=False)
model.save_pretrained(OUTPUT_DIR)
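# save_pretrained on a PEFT model writes only the adapter weights and config
# (adapter_model.bin / adapter_config.json), not the full base model.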