import warnings

import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

warnings.filterwarnings("ignore")

base_model_id = "google/gemma-2b"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    print("Available devices:", torch.cuda.device_count())
    print("Current device:", torch.cuda.current_device())
print("Using device:", device)

# Load the jokes dataset and grab its train split
dataset = load_dataset("ysharma/short_jokes")
train_data = dataset["train"]

# Shuffle the dataset and select 20% of the data
# (slicing a Dataset returns plain Python lists, so use .select() to keep a Dataset;
#  note that the train/test split further below still uses the full train split)
twenty_percent_size = int(0.2 * len(train_data))
subset = train_data.shuffle(seed=42).select(range(twenty_percent_size))

# Accelerate / FSDP setup
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

# Tokenizer: left padding plus explicit BOS/EOS so padded causal-LM batches line up
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization and bfloat16 compute (QLoRA setup)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
)


def tokenize_function(examples):
    return tokenizer(examples["Joke"], padding="max_length", truncation=True, max_length=50)


# Hold out 10% of the train split for evaluation
train_test_split = dataset["train"].train_test_split(test_size=0.1)
train_data = train_test_split["train"]
test_data = train_test_split["test"]

# Tokenize the newly split datasets
tokenized_train_data = train_data.map(tokenize_function, batched=True)
tokenized_test_data = test_data.map(tokenize_function, batched=True)

# Baseline generation before fine-tuning
eval_prompt = " why man are "
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
)
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to(device)
model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(
        model.generate(**model_input, max_new_tokens=50, repetition_penalty=1.15)[0],
        skip_special_tokens=True,
    ))

# Prepare the quantized model for LoRA fine-tuning
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
""" trainable_params = 0 all_param = 0 for _, param in model.named_parameters(): all_param += param.numel() if param.requires_grad: trainable_params += param.numel() print( f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" ) config = LoraConfig( r=32, lora_alpha=64, target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head", ], bias="none", lora_dropout=0.05, # Conventional task_type="CAUSAL_LM", ) model = get_peft_model(model, config) print_trainable_parameters(model) # if torch.cuda.device_count() > 1: # If more than 1 GPU # model.is_parallelizable = True # model.model_parallel = True model.to(device) accelerator = Accelerator(fsdp_plugin=fsdp_plugin) print("Accelerator device:", accelerator.device) model = accelerator.prepare_model(model) from datetime import datetime project = "jokes-gemma" base_model_name = "gemma" run_name = base_model_name + "-" + project output_dir = "./" + run_name trainer = transformers.Trainer( model=model, train_dataset=tokenized_train_data, eval_dataset=tokenized_test_data, args=transformers.TrainingArguments( output_dir=output_dir, warmup_steps=1, per_device_train_batch_size=2, gradient_accumulation_steps=1, gradient_checkpointing=True, max_steps=500, learning_rate=2.5e-5, # Want a small lr for finetuning bf16=True, optim="paged_adamw_8bit", logging_steps=25, # When to start reporting loss logging_dir="./logs", # Directory for storing logs save_strategy="steps", # Save the model checkpoint every logging step save_steps=25, # Save checkpoints every 50 steps evaluation_strategy="steps", # Evaluate the model every logging step eval_steps=25, # Evaluate and save checkpoints every 50 steps do_eval=True, # Perform evaluation at the end of training report_to="wandb", # Comment this out if you don't want to use weights & baises run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}" # Name of the W&B run (optional) ), data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False), ) model.config.use_cache = False # silence the warnings. Please re-enable for inference! trainer.train()