Based on [alpaca lora](https://github.com/tloen/alpaca-lora/blob/main/finetune.py).

In [None]:
# !apt update
# !apt upgrade -y cuda-nvcc-12-0

In [None]:
import torch

# Report which PyTorch build this kernel is using; the explicit "\n" leaves a
# blank line after the version string.
print(f"Torch Version: {torch.__version__}\n")

# !nvcc --version

# !nvidia-smi

In [None]:
# !pip install -U cuda-python
# !pip3 install -U torch torchvision torchaudio #--index-url https://download.pytorch.org/whl/cu118

# # Paperspace
# !git clone https://github.com/timdettmers/bitsandbytes.git
# !cd bitsandbytes && CUDA_VERSION=116 make cuda11x && python setup.py install
# !cp /notebooks/bitsandbytes/bitsandbytes/libbitsandbytes_cuda116.so /usr/lib/python3.9/
# !pip install -U bitsandbytes

# # Google Colab
# #!pip install -U git+https://github.com/TimDettmers/bitsandbytes

# !pip install -U git+https://github.com/huggingface/transformers.git
# !pip install -U git+https://github.com/huggingface/peft.git
# !pip install -U datasets accelerate

In [None]:
#!find / -name bitsandbytes

#!find / -name libbitsandbytes_cuda116.so

#!cp /notebooks/bitsandbytes/bitsandbytes/libbitsandbytes_cuda116.so /usr/lib/python3.9/

#!ls /usr/lib/python3.9/

#!python -m bitsandbytes

In [None]:
import os

# To choose a specific GPU:
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoConfig, LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM
from peft import prepare_model_for_kbit_training, prepare_model_for_int8_training, LoraConfig, get_peft_model
from peft.peft_model import PeftModel


In [None]:
# Training hyperparameters shared by the cells below.

# Examples per device per forward/backward pass (used as
# per_device_train_batch_size in TrainingArguments).
# NOTE(review): the earlier remark here about preferring powers of 2 does not
# match the value — 6 is not a power of 2.
MICRO_BATCH_SIZE = 6
# Target number of examples per optimizer step.
BATCH_SIZE = 128
# Integer division: 128 // 6 == 21, so the effective batch per optimizer step
# is 21 * 6 == 126 examples per device, not exactly BATCH_SIZE.
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
# Passed to TrainingArguments as num_train_epochs.
EPOCHS = 2
LEARNING_RATE = 3e-4 # the Karpathy constant
# Token length of each training sequence: tokenize() pads/truncates to
# CUTOFF_LEN + 1 and then drops the last position.
CUTOFF_LEN = 256
# LoRA settings forwarded to LoraConfig (r, lora_alpha, lora_dropout).
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

In [None]:
from huggingface_hub import snapshot_download

# Hugging Face Hub id of the base checkpoint to fine-tune.
# Another checkpoint id kept for reference: "VMware/open-llama-13b-open-instruct"
base_model_id = "openlm-research/open_llama_3b_v2"
use_fast_tokenizer = False
# snapshot_download(repo_id=base_model_id)

# Original author's note: LlamaTokenizer is faster if the model is a Llama:
# tokenizer = LlamaTokenizer.from_pretrained(base_model_id, use_fast=use_fast_tokenizer)
# Generic loader that works for other model families as well:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    use_fast=use_fast_tokenizer,
)

# Llama-specific alternative:
# model = LlamaForCausalLM.from_pretrained(base_model_id, load_in_8bit=True, low_cpu_mem_usage=True, device_map='auto', torch_dtype=torch.float16)
# Generic loader; the weights are requested in 8-bit with automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    load_in_8bit=True,
    low_cpu_mem_usage=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

In [None]:
# Prepare the 8-bit base model for adapter training.
# `prepare_model_for_kbit_training` is the supported PEFT entry point; the
# previously used `prepare_model_for_int8_training` is its deprecated alias.
model = prepare_model_for_kbit_training(model)

# LoRA adapters are attached to the attention query/value projections only.
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
# To continue from a previously saved adapter instead of a fresh one:
# model = PeftModel.from_pretrained(model, "open-llama-3bv2-lora-cabra-adapter-120steps", config=config)
tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token

# Training data: Portuguese translation of the Dolly 15k instruction set,
# fetched as a single JSON file from the Hugging Face Hub.
data = load_dataset(
    "json",
    data_files="https://huggingface.co/datasets/Gustrd/dolly-15k-libretranslate-pt/resolve/main/dolly-15k-libretranslate-pt.json",
)

In [None]:
import math

# Keep only a fraction of the training split to handle time constraints.
# `dataSliceNumber` is the fraction to keep: 1 keeps every row, 0.5 keeps half.
dataSliceNumber = 1
if not 0 < dataSliceNumber <= 1:
    # 0 would select nothing and values above 1 would request more rows than
    # exist, so reject both up front with a clear message.
    raise ValueError(
        f"dataSliceNumber must be in the interval (0, 1], got {dataSliceNumber!r}"
    )

total_rows = len(data["train"])
# Round up so that a non-empty split never shrinks to zero rows; the min()
# guards against float rounding pushing the count past the split size.
num_rows = min(total_rows, math.ceil(total_rows * dataSliceNumber))
# Shuffle first so the kept rows are a random sample rather than the file's head.
data["train"] = data["train"].shuffle().select(range(num_rows))

In [None]:
def generate_prompt(data_point):
    """Build the Portuguese instruction-tuning prompt for one dataset record.

    Args:
        data_point: Mapping with the keys ``"instruction"`` and ``"response"``
            and, optionally, ``"context"``.

    Returns:
        The full training text. When the record has a non-blank context, the
        prompt includes an "### Entrada:" section holding it; otherwise the
        shorter instruction-only template is used.

    Raises:
        KeyError: If ``"instruction"`` or ``"response"`` is missing.
    """
    instruction = data_point["instruction"]
    response = data_point["response"]
    # A missing, None, empty or whitespace-only context all mean "no context";
    # this avoids a KeyError and an empty "### Entrada:" section.
    context = data_point.get("context")

    if context and context.strip():
        sections = [
            "Abaixo está uma instrução que descreve uma tarefa, juntamente com uma entrada que fornece mais contexto. Escreva uma resposta que complete adequadamente o pedido.",
            "### Instrução:",
            f"{instruction}",
            "### Entrada:",
            f"{context}",
            "### Resposta:",
            f"{response}",
        ]
    else:
        sections = [
            "Abaixo está uma instrução que descreve uma tarefa. Escreva uma resposta que complete adequadamente o pedido.",
            "### Instrução:",
            f"{instruction}",
            "### Resposta:",
            f"{response}",
        ]
    return "\n".join(sections)

def tokenize(prompt):
    """Tokenize one prompt string into fixed-length model inputs.

    The prompt is truncated and padded to ``CUTOFF_LEN + 1`` positions and the
    final position is then dropped, so both returned lists hold ``CUTOFF_LEN``
    entries.

    Args:
        prompt: The full training text for one example.

    Returns:
        A dict with the keys ``"input_ids"`` and ``"attention_mask"``.
    """
    # Original author's note: tokenizer settings could probably do this
    # directly; the manual trim was the quick route.
    encoded = tokenizer(
        prompt,
        max_length=CUTOFF_LEN + 1,
        truncation=True,
        padding="max_length",
    )
    return {
        field: encoded[field][:-1]
        for field in ("input_ids", "attention_mask")
    }

In [None]:
def _example_to_features(example):
    """Render one raw record as a prompt and tokenize it."""
    return tokenize(generate_prompt(example))


# Shuffle every split, then attach the tokenized fields to each record.
shuffled = data.shuffle()
data = shuffled.map(_example_to_features)

In [None]:
# Optimisation schedule and checkpointing policy for the run: a checkpoint is
# written every 20 steps and at most 4 are kept in the output directory.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=100,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=20,
    output_dir="lora-cabra-3Bv2",
    save_total_limit=4,
    save_steps=20,
)

# mlm=False selects the collator's causal (non-masked) language-modeling mode.
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=training_args,
    data_collator=collator,
)

# Switch off the model's generation cache flag for the training run.
model.config.use_cache = False



In [None]:

import os

from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in the output directory when one exists;
# otherwise start from scratch. Passing `resume_from_checkpoint=True`
# unconditionally fails on a first run, when no checkpoint has been written.
checkpoint_dir = trainer.args.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
# Write the trained model to a local directory via `save_pretrained`.
# NOTE(review): `model` is the object returned by get_peft_model, so this
# presumably stores only the LoRA adapter rather than the full base weights —
# confirm. The "140steps" suffix in the directory name is hard-coded.
model.save_pretrained("open-llama-3bv2-lora-cabra-adapter-140steps")

In [None]:
# !tar -czvf open-llama-13b-lora-cabra-adapter.tar.gz ./open-llama-13b-lora-cabra-adapter