# lora/lora_bloom_3b.py
# LoRA exercise: fine-tune bigscience/bloom-3b on SQuAD v2 with a LoRA adapter
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
from datasets import load_dataset
import bitsandbytes as bnb
import transformers
import torch.nn as nn
import torch
# check if CUDA is available (actual placement is handled by device_map='auto' below)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load the base model in half precision
model = AutoModelForCausalLM.from_pretrained(
    'bigscience/bloom-3b',
    torch_dtype=torch.float16,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained("bigscience/tokenizer")
# print the model to inspect its module names (the LoRA target_modules below must match one of them)
print(model)
# freeze all base-model parameters; only the LoRA adapters will be trained
for param in model.parameters():
    param.requires_grad = False
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

# reduce activation memory and make sure inputs to the frozen layers require grads
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
# CastOutputToFloat ensures the output of model.lm_head is returned in float32,
# which keeps the loss computation numerically stable under fp16 training.
class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)
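
# Optional sanity check (a minimal sketch, not part of the original exercise):
# feed a dummy hidden state through the wrapped lm_head and confirm the logits
# come back in float32 even though the weights stay in float16.
with torch.no_grad():
    _dummy_hidden = torch.zeros(
        1, 1, model.config.hidden_size,
        dtype=torch.float16,
        device=model.lm_head[0].weight.device,
    )
    print(model.lm_head(_dummy_hidden).dtype)  # expected: torch.float32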
# report how many parameters will actually be updated during training
def print_trainable_parameters(model):
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f'Trainable Params: {trainable_params}, All Params: {all_params}, Trainable %: {100 * trainable_params / all_params:.4f}')
# obtain the LoRA config
config = LoraConfig(
    r=8,                                  # rank of the low-rank update matrices
    lora_alpha=16,                        # scaling factor applied to the update
    target_modules=['query_key_value'],   # BLOOM's fused attention projection
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# wrap the base model with the LoRA adapters and compare the parameter counts
model = get_peft_model(model=model, peft_config=config)
print_trainable_parameters(model=model)
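
# PEFT's PeftModel also exposes a built-in report of the same numbers; printing
# it here cross-checks the custom helper above.
model.print_trainable_parameters()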
# load SQuAD v2 (some questions are unanswerable, hence the fallback answer below)
qa_dataset = load_dataset('squad_v2')

def create_prompt(context, question, answer):
    if len(answer['text']) < 1:
        result = "I don't know the answer"
    else:
        result = answer['text'][0]
    prompt_template = f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n{result}</s>"
    return prompt_template
mapped_dataset = qa_dataset.map(lambda samples: tokenizer(create_prompt(samples['context'],samples['question'],samples['answers'])))
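
# Optional sanity check (a small sketch): decode the first tokenized training
# example back to text to verify the prompt template looks as intended.
print(tokenizer.decode(mapped_dataset['train'][0]['input_ids']))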
# set up the Hugging Face Trainer for causal-LM fine-tuning
trainer = transformers.Trainer(
    model=model,
    train_dataset=mapped_dataset['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=100,        # note: max_steps overrides num_train_epochs when both are set
        num_train_epochs=3,
        learning_rate=1e-3,
        fp16=True,
        logging_steps=1,
        output_dir='outputs',
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# disable the KV cache during training: it is incompatible with gradient
# checkpointing and only speeds up generation at inference time
model.config.use_cache = False
trainer.train()
# upload the trained LoRA adapter to the Hugging Face Hub
model_name = "bloom7b__finetune_sample"
HUGGING_FACE_USER_NAME = "james92"
model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", use_auth_token=True)
print("Model is saved to the Hugging Face Hub")