## Load the Dataset

In [1]:
# Opening modified dataset uploaded to HF
from datasets import load_dataset

dataset_name = "RaviNaik/oasst1-chatml"
dataset = load_dataset(dataset_name, split="train")

## Load the Model, Tokenizer and configure bnb

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

In [3]:
model_name = "microsoft/phi-2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="cuda:0"
)
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, device_map="cuda:0")
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Display Model Layers

In [5]:
model

PhiForCausalLM(
  (transformer): PhiModel(
    (embd): Embedding(
      (wte): Embedding(51200, 2560)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (h): ModuleList(
      (0-31): 32 x ParallelBlock(
        (ln): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
        (mixer): MHA(
          (rotary_emb): RotaryEmbedding()
          (Wqkv): Linear4bit(in_features=2560, out_features=7680, bias=True)
          (out_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (inner_attn): SelfAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
          (inner_cross_attn): CrossAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
        (mlp): MLP(
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
          (act): NewGELUActivation()
        )
      )

## Configure LoRA for finetuning

In [6]:
from peft import LoraConfig

In [7]:
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "Wqkv",
        "out_proj",
        "fc1",
        "fc2",
    ]
)

## Configure Training Params

In [8]:
from transformers import TrainingArguments

In [9]:
output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 100
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 500
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=False,
)

In [10]:
from trl import SFTTrainer

In [11]:
max_seq_length = 256

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

In [12]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32).to("cuda:0")

## Begin Training

In [13]:
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.7208
20,1.5483
30,1.5507
40,1.5435
50,1.5066
60,1.5014
70,1.5596
80,1.5529
90,1.5063
100,1.4717




config.json:   0%|          | 0.00/866 [00:00<?, ?B/s]

You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.
You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.
You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.
You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.


TrainOutput(global_step=500, training_loss=1.4746462078094482, metrics={'train_runtime': 4307.6684, 'train_samples_per_second': 3.714, 'train_steps_per_second': 0.116, 'total_flos': 6.667526640623616e+16, 'train_loss': 1.4746462078094482, 'epoch': 1.62})

## Inference

In [15]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

In [16]:
chat_template = """<|im_start|>system
You are a helpful assistant who always respond to user queries<|im_end|>
<im_start>user
{prompt}<|im_end|>
<|im_start|>assistant
"""

In [17]:
prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(chat_template.format(prompt=prompt))
print(result[0]['generated_text'])



<|im_start|>system
You are a helpful assistant who always respond to user queries<|im_end|>
<im_start>user
What is a large language model?<|im_end|>
<|im_start|>assistant
A large language model (LLM) is a type of artificial intelligence model that is designed to understand and generate human language. LLMs are typically trained on large amounts of text data and are capable of performing a wide range of language-related tasks, such as language translation, text summarization, and text generation.

LLMs are different from traditional language models, such as recurrent neural networks (RNNs) or transformers, in that they are designed to handle large amounts of text data and are able to learn from this data in a more efficient and effective way. This makes them well-suited for tasks that require a deep understanding of language, such as natural language processing (N


In [18]:
prompt = "Write a Python program to print first 50 prime numbers"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(chat_template.format(prompt=prompt))
print(result[0]['generated_text'])

<|im_start|>system
You are a helpful assistant who always respond to user queries<|im_end|>
<im_start>user
Write a Python program to print first 50 prime numbers<|im_end|>
<|im_start|>assistant
Here's a Python program that prints the first 50 prime numbers:

```python
def is_prime(n):
    if n < 2:
        return False
    for i in range(2, int(n**0.5) + 1):
        if n % i == 0:
            return False
    return True

count = 0
num = 2
while count < 50:
    if is_prime(num):
        print(num)
        count += 1
    num += 1
```

This program uses a helper function `is_prime` to check if a number is prime


In [19]:
prompt = "Can you write a short introduction about the relevance of the term monopsony in economics?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(chat_template.format(prompt=prompt))
print(result[0]['generated_text'])

<|im_start|>system
You are a helpful assistant who always respond to user queries<|im_end|>
<im_start>user
Can you write a short introduction about the relevance of the term monopsony in economics?<|im_end|>
<|im_start|>assistant
Sure, I'd be happy to help!

Monopsony is a term used in economics to describe a market structure in which there is only one buyer for a particular good or service. This means that the buyer has significant bargaining power and can set the price at which they will purchase the good or service.

In a monopsony, the buyer has the ability to influence the price of the good or service, which can have significant implications for the market. For example, if the buyer is a large corporation, they may be able to drive down the price of the good or service, which can have negative effects on the


In [20]:
trainer.save_model("checkpoints")

You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.


In [21]:
trainer.hub_model_id = "RaviNaik/Phi2-OSST"
trainer.push_to_hub()

You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1704991189.si-aiwork2.2021276.0:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.66k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/RaviNaik/Phi2-Osst/commit/987f0bdcb557c8cb356eb79452181f2944c21f8b', commit_message='End of training', commit_description='', oid='987f0bdcb557c8cb356eb79452181f2944c21f8b', pr_url=None, pr_revision=None, pr_num=None)