Tokenizer causes issues in fine-tuning because of special tokens of the form <|X|>

#67 by LazerJesus

I'll run through my setup and then get to the problem.

I am setting up the config, tokenizer, model, and PEFT model:

from peft import LoraConfig, TaskType
import torch

CHATPATH = "/notebooks/starchat-beta"
BASEPATH = "/notebooks/starcoderplus"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

peftconfig = LoraConfig(
    base_model_name_or_path=BASEPATH,
    task_type=TaskType.CAUSAL_LM,
    target_modules=["c_proj", "c_attn", "q_attn"],
    bias="none",
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.01,
)

from transformers import AutoTokenizer

system_token = "<|system|>"
user_token = "<|user|>"
assistant_token = "<|assistant|>"
end_token = "<|end|>"

tokenizer = AutoTokenizer.from_pretrained(BASEPATH)
tokenizer.pad_token=tokenizer.eos_token
added_tokens = tokenizer.add_special_tokens({"additional_special_tokens": [system_token, user_token, assistant_token, end_token]})

print("tokenizer.vocab_size", tokenizer.vocab_size, added_tokens)
> tokenizer.vocab_size 49152 0

from transformers import AutoModelForCausalLM
import torch

model = AutoModelForCausalLM.from_pretrained(
    BASEPATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # accelerate already places the weights, so no extra .to(DEVICE) is needed
)

freeze_model(model)  # my helper that sets requires_grad=False on the base model's parameters

from peft import get_peft_model

peftmodel = get_peft_model(model, peftconfig)
peftmodel.resize_token_embeddings(len(tokenizer))
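
As a quick sanity check after wrapping (print_trainable_parameters is a standard PeftModel method; the embedding comparison is just something I added for illustration):

peftmodel.print_trainable_parameters()   # how many parameters LoRA actually trains
print("embedding rows:", model.get_input_embeddings().weight.shape[0])   # rows in the (resized) input embedding
print("len(tokenizer):", len(tokenizer))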

Now we have the PEFT model and the tokenizer set up. Note that even though I add the special tokens, they don't get added (add_special_tokens returns 0).
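
As far as I can tell, that is because tokenizer.vocab_size only counts the base vocabulary, while the chat tokens already sit in the tokenizer as added tokens; len(tokenizer) and get_added_vocab() make this visible (snippet added here just for illustration):

print("vocab_size:    ", tokenizer.vocab_size)         # base vocabulary only
print("len(tokenizer):", len(tokenizer))               # base vocabulary + added tokens
print("added vocab:   ", tokenizer.get_added_vocab())  # dict of added token -> id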

I continue with setting up the data.

import pandas as pd
from datasets import Dataset

system_token = "<|system|>"
user_token = "<|user|>"
assistant_token = "<|assistant|>"
end_token = "<|end|>"
system_msg = "X"

def prepare_dialogue(row):
    # print(row)
    prompt = system_token + "\n" + system_msg + end_token + "\n"
    prompt += user_token + "\n" + row["prompt"] + end_token + "\n"
    prompt += assistant_token + "\n" + row["completion"] + end_token + "\n"
    row["dialogue"] = prompt
    return row

def strip_quotes(val):
    return val.strip('"') if isinstance(val, str) else val

def prepare_row(row):
    for col in row.index:
        row[col] = row[col].strip("'").strip("';")
    return prepare_dialogue(row)

def prepare_data(data):
    data.rename(columns={"'completion';": 'completion', "'prompt'": 'prompt'}, inplace=True)
    data = data.apply(prepare_row, axis=1)
    return data

def load_data(path):
    data = pd.read_csv(path, delimiter=";", quotechar="'", skipinitialspace=True)
    return Dataset.from_pandas(prepare_data(data))
    
trainingdata = load_data("./data/training.csv")
testingdata = load_data("./data/testing.csv")
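
For illustration, this is roughly what a single prepared row ends up looking like (the prompt/completion values here are made up):

example = pd.Series({"prompt": "write a hello world one-liner", "completion": "print('hello world')"})
print(prepare_dialogue(example)["dialogue"])
# <|system|>
# X<|end|>
# <|user|>
# write a hello world one-liner<|end|>
# <|assistant|>
# print('hello world')<|end|>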

def tokenize(batch):
    batch_dialogues = batch['dialogue']   # Fetch the 'dialogue' field
    tokenization = tokenizer(batch_dialogues, padding=True, return_token_type_ids=False)
    labels = tokenization.input_ids.copy()
    # mask_user_labels(tokenizer, labels) # not working.
    tokenization['labels'] = labels
    return tokenization

from datasets import DatasetDict
dataset = DatasetDict({
    'train': trainingdata.map(tokenize, batched=True),
    'test': testingdata.map(tokenize, batched=True),
})
for key in dataset:
    dataset[key] = dataset[key].remove_columns(['dialogue', 'completion', 'prompt'])

Let me go through the important parts.
The prepare_dialogue function takes the data from my CSV and formats it according to the dialogue template.
The tokenize function takes a batch, tokenizes it, and adds the token ids as labels to the dataset.
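
One caveat I'm aware of: since the labels are a plain copy of the padded input_ids, the pad positions also contribute to the loss. The usual convention, as far as I understand it, is to set those positions to -100 so CrossEntropyLoss ignores them. A minimal sketch (the helper name is mine, and this is not the mask_user_labels logic I commented out above):

def tokenize_with_pad_masking(batch):
    tokenization = tokenizer(batch['dialogue'], padding=True, return_token_type_ids=False)
    # -100 is the ignore_index of PyTorch's CrossEntropyLoss, so padded positions drop out of the loss
    tokenization['labels'] = [
        [tok if tok != tokenizer.pad_token_id else -100 for tok in input_ids]
        for input_ids in tokenization.input_ids
    ]
    return tokenization

Note that pad_token is set to eos_token here, so any genuine eos tokens in the text would be masked as well.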

Here is the crux of the matter.

print(dataset['train'])
print('torch max: ', torch.max(torch.tensor(dataset['train']["labels"])))

final_layer = list(peftmodel.modules())[-1]

if isinstance(final_layer, torch.nn.Linear):
    print(f"The output dimension is {final_layer.out_features}")
else:
    print("Final layer is not a Linear layer.")


> Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 228
})
> torch max:  tensor(49155)
> The output dimension is 49152

Here is the problem:
the largest label id found is 49155, but the output dimension is only 49152.

print("system_token_id:", tokenizer.convert_tokens_to_ids(system_token))
print("user_token_id:", tokenizer.convert_tokens_to_ids(user_token))
print("assistant_token_id:", tokenizer.convert_tokens_to_ids(assistant_token))
print("end_token_id:", tokenizer.convert_tokens_to_ids(end_token))

> system_token_id: 49152
> user_token_id: 49154
> assistant_token_id: 49153
> end_token_id: 49155

The added tokens account for the difference.
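
A more direct way to see the same gap (get_output_embeddings is a standard transformers method; this is only meant to illustrate the mismatch):

print("len(tokenizer):       ", len(tokenizer))  # 49152 base vocab + the 4 added tokens
print("output head dimension:", model.get_output_embeddings().out_features)  # reported as 49152 above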

What am I to do here?
Training like this throws errors because of the dimension mismatch,
and not adding the tokens makes no sense as per the "documentation" or the code.
