In [None]:
!pip install accelerate transformers einops datasets peft bitsandbytes torch

Collecting accelerate
 Downloading accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Collecting transformers
 Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
 Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)
Collecting datasets
 Downloading datasets-2.17.1-py3-none-any.whl.metadata (20 kB)
Collecting peft
 Downloading peft-0.8.2-py3-none-any.whl.metadata (25 kB)
Collecting bitsandbytes
 Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Collecting torch
 Downloading torch-2.2.0-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting huggingface-hub (from accelerate)
 Downloading huggingface_hub-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting safetensors>=0.3.1 (from accelerate)
 Downloading safetensors-0.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collect

In [None]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import os

In [None]:
# Load the tokenizer published with the microsoft/phi-2 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
# Use the end-of-sequence token for padding as well.
# NOTE(review): presumably the checkpoint ships without a dedicated pad token - confirm.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Quantization settings for loading the base weights in 4-bit NF4 with
# double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load phi-2 with the quantization config above; the "" key of the device map
# assigns the whole module tree to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
)

In [None]:
# Print the module hierarchy of the loaded model (layer names and shapes).
print(model)

In [None]:
# Attach LoRA adapters to the quantized base model.
# The base weights are loaded in 4-bit, and Trainer refuses to fine-tune a
# purely quantized model that has no trainable adapters on top of it, so this
# cell has to run before the Trainer is built.
config = LoraConfig(
    r=16,               # rank of the low-rank update matrices
    lora_alpha=16,      # scaling applied to the LoRA update
    # Attention projections to adapt.
    # NOTE(review): names assume the q_proj/k_proj/v_proj layout - confirm
    # against the module names printed by print(model).
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",        # leave bias terms frozen
    task_type="CAUSAL_LM",
)

# NOTE: this rebinds `model`; re-running the cell without reloading the base
# model would wrap an already-wrapped model.
model = get_peft_model(model, config)
model.print_trainable_parameters()

In [None]:
def tokenize(sample):
    """Tokenize the "text" field of a (batched) dataset sample.

    Sequences are truncated to at most 512 tokens. No padding is applied
    here: with ``batched=True`` padding would extend every row to the longest
    row of the whole map batch, whereas the data collator used for training
    already pads each training batch to its own longest sequence.

    Args:
        sample: Mapping with a "text" entry (a string or a list of strings).

    Returns:
        The tokenizer output for ``sample["text"]``.
    """
    return tokenizer(sample["text"], truncation=True, max_length=512)

In [None]:
!pip install scikit-learn

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
# Relative path of the CSV file to load (resolved against the kernel's working directory).
dataset_name='data.csv'
# Read the whole file into a DataFrame; later cells read its "user" and "assistant" columns.
df = pd.read_csv(dataset_name)
# Train/test split is currently disabled; the full frame is used downstream.
#train, test = train_test_split(df, test_size=0.2)

In [None]:
def _format_row(row):
    """Join one row's user and assistant fields into a single training string."""
    return "question: " + str(row["user"]) + " answer: " + str(row["assistant"])


# `data_df` is an alias of `df`, so the new "text" column lands on `df` too.
data_df = df
data_df["text"] = data_df[["user", "assistant"]].apply(_format_row, axis=1)

# Convert to a Hugging Face Dataset and tokenize in batches; every original
# column is dropped so only the tokenizer outputs remain.
data = Dataset.from_pandas(data_df)
tokenized_data = data.map(
    tokenize,
    batched=True,
    desc="Tokenizing data",
    remove_columns=data.column_names,
)
tokenized_data

In [None]:
# Training hyper-parameters.
# The run length is controlled by max_steps alone: a positive max_steps
# overrides num_train_epochs, so passing both (as this cell previously did,
# with num_train_epochs=3) was contradictory and the epoch count was ignored.
# To train by epochs instead, remove max_steps and set num_train_epochs.
training_arguments = TrainingArguments(
    output_dir=".",                  # checkpoints are written to the working directory
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=50,
    max_steps=10000,
    push_to_hub=True,                # requires being logged in to the Hub before training
)

In [None]:
!pip install huggingface_hub

In [None]:
pip install --upgrade jupyterlab jupyterlab-git

In [None]:
from huggingface_hub import interpreter_login
# Log in to the Hugging Face Hub from the interpreter; later cells push to the
# Hub (push_to_hub=True in the training arguments and model.push_to_hub).
# NOTE(review): expected to prompt interactively for an access token - confirm.
interpreter_login()

In [None]:
 # NOTE(review): these pins (torch 2.1.0, accelerate 0.21.0, tokenizers 0.13.3)
 # differ from what the first install cell fetched (torch 2.2.0, accelerate
 # 0.27.2 per its recorded output). Modules already imported by this kernel
 # stay loaded until it is restarted, so the reinstall only takes effect after
 # a restart - confirm which set of versions is intended.
 !pip install torch==2.1.0 pytorch-lightning==1.9.4 accelerate==0.21.0 tokenizers==0.13.3 transformers

In [None]:
# Ask the model to make its input embeddings require gradients.
# NOTE(review): presumably needed so gradients can flow through the frozen,
# quantized base weights during fine-tuning - confirm against the Transformers docs.
model.enable_input_require_grads()

In [None]:
# mlm=False selects causal (next-token) language modelling rather than
# masked-token prediction.
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_data,
    data_collator=causal_lm_collator,
)

# Run the fine-tuning loop; the returned training summary is the cell output.
trainer.train()

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM
import torch
# Reload the base microsoft/phi-2 checkpoint in full float32 precision (no quantization).
# NOTE(review): this rebinds `model`, replacing the object that was trained
# above. With the adapter lines below commented out, nothing from training is
# loaded into this new model - confirm whether the merge step should be enabled.
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True, torch_dtype=torch.float32)
# Disabled: load a PEFT adapter from the "aissatoubalde/lab" Hub repo on top of
# the base model and merge it into the base weights.
# peft_model = PeftModel.from_pretrained(model, "aissatoubalde/lab", from_transformers=True)
# model = peft_model.merge_and_unload()
# model

In [None]:
# Upload the current `model` object to the Hub repository "aissatoubalde/lab".
# NOTE(review): `model` was last assigned from a plain reload of
# microsoft/phi-2, so as written this uploads the base weights - confirm intent.
model.push_to_hub('aissatoubalde/lab')