---
base_model: unsloth/Meta-Llama-3.1-8B-bnb-4bit
language:
- en
license: apache-2.0
tags:
- text-generation-inference
- transformers
- unsloth
- llama
- trl
- sft
---

# Uploaded model

- **Developed by:** vakodiya
- **License:** apache-2.0
- **Finetuned from model:** unsloth/Meta-Llama-3.1-8B-bnb-4bit

This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.

# Code To Train Model on Google Colab

# Installing required packages
```
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
```

# Importing required modules
```
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported
```

# Logging in to Hugging Face with a write access token stored in Colab secrets
```
from huggingface_hub import login
from google.colab import userdata

hf_token = userdata.get('HF_API_KEY')
login(token=hf_token)
```

# Check if a GPU is available
```
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and being used.")
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")
```

# Loading model from Hugging Face
```
max_seq_length = 1024
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
)
```
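# Checking trainable parameters (optional)
Before wiring up the trainer, it can help to confirm that only the LoRA adapter weights will be updated. This quick check is not part of the original notebook; it only uses the `model` object returned above.
```
# Count trainable vs. total parameters: only the LoRA adapters (a small
# fraction of the 8B base weights) should require gradients.
# Note: the total may under-count packed 4-bit base weights.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
```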
# Loading and formatting the dataset
```
raw_dataset = load_dataset("viber1/indian-law-dataset", split="train[:1000]")

# Define a simple prompt template using only Instruction and Response
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

# EOS token for marking the end of each example
EOS_TOKEN = tokenizer.eos_token

# Function to format prompts with only Instruction and Response
def formatting_prompts_func(examples):
    instructions = examples["Instruction"]
    responses = examples["Response"]
    # Create a formatted text for each example
    texts = []
    for instruction, response in zip(instructions, responses):
        # Format the text with the prompt template and add the EOS token
        text = alpaca_prompt.format(instruction, response) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Apply the formatting function to the dataset
dataset = raw_dataset.map(formatting_prompts_func, batched=True)
```

# Using the Trainer with a low batch size, gradient checkpointing, LoRA and quantization
```
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True,
    args=TrainingArguments(
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        num_train_epochs=1,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=10,
        output_dir="output",
        seed=0,
    ),
)
```

# Show current memory stats
```
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
```

# Start Training
```
trainer_stats = trainer.train()
```

# Show final memory and time stats
```
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
```

# Finally, saving the trained model and pushing it to Hugging Face
```
# Merge the LoRA adapters into the base model and save in 16-bit
model.save_pretrained_merged("Indian-Law-Llama-3.1-8B", tokenizer, save_method="merged_16bit")
model.push_to_hub_merged("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B", tokenizer, save_method="merged_16bit", token=hf_token)
```

# Model usage with streaming response
```
# alpaca_prompt = copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

inputs = tokenizer(
    [
        alpaca_prompt.format(
            "What is the difference between a petition and a plaint in Indian law?",
            "",  # leave the response empty for generation
        )
    ],
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
```
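# Loading the merged model with plain transformers (optional)
Because the merged 16-bit weights are pushed to the Hub, the fine-tuned model can also be used without Unsloth. The following is a minimal sketch, not taken from the training notebook; it assumes the push above succeeded and a CUDA GPU is available.
```
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

repo_id = "vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.float16, device_map="auto")

# Same Instruction/Response template used during training
prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{}\n\n### Response:\n{}"
).format("What is the difference between a petition and a plaint in Indian law?", "")

inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=128)
```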