# -*- coding: utf-8 -*-
"""Mistral_fine-tuning.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/12WLPo_Aicjhp7sy8zbZmfg5behAZJVYN
"""

!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy ipywidgets matplotlib

from datasets import load_dataset

dataset = load_dataset("Devdeshitha/testdataset1")

# Load the local train/validation splits from JSONL files.
train_dataset = load_dataset('json', data_files='notes.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='notes_validation.jsonl', split='train')

def formatting_func(example):
    # Turn each record into a single prompt string with question/answer markers.
    text = f"### Question: {example['input']}\n ### Answer: {example['output']}"
    return text

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"

# Load the base model in 4-bit NF4 with double quantization so it fits on a single GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

def generate_and_tokenize_prompt(prompt):
    return tokenizer(formatting_func(prompt))

tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

max_length = 1024  # This was an appropriate max length for my dataset

def generate_and_tokenize_prompt2(prompt):
    # Tokenize with truncation and fixed-length padding, and copy input_ids to labels
    # so the model is trained with a causal language modeling objective.
    result = tokenizer(
        formatting_func(prompt),
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt2)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt2)

print(tokenized_train_dataset[1]['input_ids'])

eval_prompt = "How does consolidating credit cards into a loan work?: "

# Re-init the tokenizer so it doesn't add padding or an eos token.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
)

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=512, repetition_penalty=1.15)[0], skip_special_tokens=True))

"""Set Up LoRA

Now, to start our fine-tuning, we have to apply some preprocessing to the model to
prepare it for training. For that, use the prepare_model_for_kbit_training method from PEFT.
"""
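# The sketch below illustrates the LoRA setup described above. It is a minimal,
# hedged example: the rank, alpha, dropout, and target_modules values are
# illustrative assumptions, not values from the original notebook, and should be
# tuned for your own dataset and hardware.
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Enable gradient checkpointing and prepare the quantized model for k-bit training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Attach LoRA adapters to the attention projections of Mistral-7B (assumed module names).
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # sanity check: only a small fraction of weights should be trainable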