In [None]:
!pip install -q peft transformers datasets huggingface_hub
!pip install flash-attn --no-build-isolation

In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm
from huggingface_hub import notebook_login
from huggingface_hub import HfApi

In [None]:
# Interactive Hugging Face Hub login from inside the notebook; presumably needed
# so the upload to the Hub further down can authenticate.
notebook_login()

In [24]:
# Push this notebook file to the model repo on the Hugging Face Hub, keeping the
# same filename in the repo as on disk. The call's return value is the cell output.
api = HfApi()
notebook_path = 'prompt_tune_phi3.ipynb'
api.upload_file(
    path_or_fileobj=notebook_path,
    path_in_repo=notebook_path,
    repo_id='Granther/prompt-tuned-phi3',
    repo_type='model',
)

CommitInfo(commit_url='https://huggingface.co/Granther/prompt-tuned-phi3/commit/912e66e469c6dd381daaa1ee25f5284e17c9377a', commit_message='Upload prompt_tune_phi3.ipynb with huggingface_hub', commit_description='', oid='912e66e469c6dd381daaa1ee25f5284e17c9377a', pr_url=None, pr_revision=None, pr_num=None)

In [6]:
# --- Run configuration: everything a reader might want to tune ---------------

# Prefer the GPU, but fall back to CPU so the notebook still runs without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_id = 'microsoft/Phi-3-mini-128k-instruct'

peft_conf = PromptTuningConfig(
    peft_type=PeftType.PROMPT_TUNING,          # which PEFT method to apply
    task_type=TaskType.CAUSAL_LM,              # task the base model is used for
    prompt_tuning_init=PromptTuningInit.TEXT,  # 'TEXT' => initialise from prompt_tuning_init_text
    num_virtual_tokens=8,                      # number of trainable soft-prompt (virtual) tokens
    prompt_tuning_init_text="Classify if the tweet is a complaint or not:",
    tokenizer_name_or_path=model_id,
)

dataset_name = "twitter_complaints"
# Use the enum members' `.value` instead of formatting the members directly:
# how a str-mixin enum renders inside an f-string depends on the Python version
# (plain value vs. 'PeftType.PROMPT_TUNING'), which would change the file name.
# Slashes from the model id are replaced so the result is a flat file name.
checkpoint_name = (
    f"{dataset_name}_{model_id}_{peft_conf.peft_type.value}_{peft_conf.task_type.value}_v1.pt"
).replace("/", "_")

text_col = 'Tweet text'   # dataset column holding the tweet text
lab_col = 'text_label'    # column holding the label as a string
max_len = 64              # sequence-length cap (usage not shown in this part of the notebook)
lr = 3e-2                 # learning rate
epochs = 50
batch_size = 8

In [7]:
# Load the 'train' split of the `dataset_name` configuration of 'ought/raft'.
dataset = load_dataset('ought/raft', dataset_name, split='train')

In [8]:
# Inspect the class names of the `Label` feature; the label ids stored in the
# dataset are used as indices into this list in the next step.
dataset.features['Label'].names
#>>> ['Unlabeled', 'complaint', 'no complaint']

['Unlabeled', 'complaint', 'no complaint']

In [11]:
# Human-readable class names (underscores become spaces), indexed by the id
# stored in the `Label` column.
classes = [k.replace('_', ' ') for k in dataset.features['Label'].names]


def add_text_label(batch):
    """Translate a batch of `Label` ids into their class-name strings.

    Args:
        batch: a batched example dict; only `batch['Label']` is read.

    Returns:
        A dict with one new column, named by `lab_col`, holding the class name
        for every label id in the batch.
    """
    # Use the configured column name rather than repeating the literal, so the
    # column created here always matches what `lab_col` refers to.
    return {lab_col: [classes[label] for label in batch['Label']]}


dataset = dataset.map(
    add_text_label,
    batched=True,
    num_proc=10,
)

dataset[0]

Map (num_proc=10):   0%|          | 0/50 [00:00<?, ? examples/s]

'Unlabeled'

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

# If the tokenizer has no pad token, reuse the EOS token id for padding.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Length (in tokens) of the longest tokenized class label.
target_max_len = max(len(tokenizer(class_lab)['input_ids']) for class_lab in classes)

# Show the token ids for the first class label.
# `input_ids` are the ids corresponding to the tokens in the sequence; the
# attention mask returned alongside them is a binary mask used by the
# transformer to tell padding tokens from meaningful ones.
tokenizer(classes[0])['input_ids']

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[1, 853, 29880, 24025, 32000]

### Preprocess Function:
- Tokenize text and label
- Pad each example in the batch with `tokenizer.pad_token_id`
- 