# LoRA exercise
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
from datasets import load_dataset
import bitsandbytes as bnb
import transformers
import torch.nn as nn
import torch

# Check if CUDA is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = AutoModelForCausalLM.from_pretrained(
    'bigscience/bloom-3b',
    torch_dtype=torch.float16,
    device_map='auto'
)
tokenizer = AutoTokenizer.from_pretrained("bigscience/tokenizer")

# Print the model architecture
print(model)

# Freeze the base model weights; only the LoRA adapters added later will be trained
for param in model.parameters():
    param.requires_grad = False
    if param.ndim == 1:
        # Cast the small (e.g. layer norm) parameters to fp32 for stability
        param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# CastOutputToFloat ensures the output of model.lm_head is cast to float32
class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)

# Print trainable parameters
def print_trainable_parameters(model):
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f'Trainable Params: {trainable_params}, All Params: {all_params}, Trainable %: {100 * (trainable_params / all_params)}')

# Build the LoRA config
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['query_key_value'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the frozen base model with LoRA adapters and compare the trainable parameter count
model = get_peft_model(model=model, peft_config=config)
print_trainable_parameters(model=model)

qa_dataset = load_dataset('squad_v2')

def create_prompt(context, question, answer):
    if len(answer['text']) < 1:
        result = "I don't know the answer"
    else:
        result = answer['text'][0]
    prompt_template = f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n{result}"
    return prompt_template

mapped_dataset = qa_dataset.map(
    lambda samples: tokenizer(create_prompt(samples['context'], samples['question'], samples['answers']))
)

# Set up the Trainer; note that max_steps takes precedence over num_train_epochs
trainer = transformers.Trainer(
    model=model,
    train_dataset=mapped_dataset['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=100,
        num_train_epochs=3,
        learning_rate=1e-3,
        fp16=True,
        logging_steps=1,
        output_dir='outputs'
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Disable the KV cache during training: it is only useful for generation and
# conflicts with gradient checkpointing
model.config.use_cache = False
trainer.train()

# Upload the fine-tuned adapter to the Hugging Face Hub
model_name = "bloom3b_finetune_sample"
HUGGING_FACE_USER_NAME = "james92"
model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", use_auth_token=True)
print("Model is saved on the Hugging Face Hub")
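
# --- Optional follow-up (a minimal sketch): reload the pushed adapter and run inference. ---
# This is why PeftModel and PeftConfig are imported above. It assumes the push_to_hub call
# succeeded and reuses the same repo id; the example context/question below are placeholders.
peft_model_id = f"{HUGGING_FACE_USER_NAME}/{model_name}"
peft_config = PeftConfig.from_pretrained(peft_model_id)

# Reload the base model recorded in the adapter config, then attach the LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map='auto'
)
inference_model = PeftModel.from_pretrained(base_model, peft_model_id)
inference_model.eval()

# Build a prompt in the same format used for training
prompt = (
    "### CONTEXT\nThe Eiffel Tower is located in Paris.\n\n"
    "### QUESTION\nWhere is the Eiffel Tower?\n\n"
    "### ANSWER\n"
)
inputs = tokenizer(prompt, return_tensors='pt').to(device)
with torch.no_grad():
    output_ids = inference_model.generate(**inputs, max_new_tokens=30)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))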