Getting an error when trying to use a model fine-tuned from JASMINE-350M


Hello,

I am trying to fine-tune JASMINE-350M for a chatbot. The fine-tuning code runs fine, but at inference time I get the following error:


RuntimeError Traceback (most recent call last)
in <cell line: 43>()
46 break
47
---> 48 bot_response = chatbot_response(user_input)
49 print(f"JASMINE: {bot_response}")

3 frames
/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py in _sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3247 probs = nn.functional.softmax(next_token_scores, dim=-1)
3248 # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
-> 3249 next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
3250 else:
3251 next_tokens = torch.argmax(next_token_scores, dim=-1)

RuntimeError: CUDA error: device-side assert triggered
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.

Please help me fix this issue, or share code that works.
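From what I understand, the device-side assert hides the real exception, so the traceback points at torch.multinomial even if the failure happens earlier (for example an out-of-range token ID). I assume something like the following rough sketch would surface a readable error, either by forcing synchronous CUDA launches or by re-running the failing call on CPU (names like model_cpu / inputs_cpu are only for illustration):

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before the first CUDA call, so restart the runtime

# Alternative: run the failing generate() on CPU to get a normal Python exception
model_cpu = model.float().to("cpu")
inputs_cpu = tokenizer("<|USER|> test <|BOT|>", return_tensors="pt")
model_cpu.generate(inputs_cpu["input_ids"], max_new_tokens=20, do_sample=True)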


Here is the complete code for your reference:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset

# Load model and tokenizer

model_name = "UBC-NLP/Jasmine-350M"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="eager").cuda()

# Add special tokens for the chatbot conversation roles

special_tokens_dict = {"additional_special_tokens": ["<|USER|>", "<|BOT|>"]}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
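# Illustrative sanity check (my assumption, added for clarity): after resizing, the embedding
# table should match len(tokenizer); otherwise the new <|USER|>/<|BOT|> IDs would index past
# it on the GPU, which is one known cause of device-side asserts.
assert model.get_input_embeddings().num_embeddings == len(tokenizer)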

# Convert the list to a Hugging Face Dataset

train_dataset_raw = Dataset.from_dict({"dialogue": train_dataset_raw})

# Use the tokenizer's eos token (fall back to "<|endoftext|>" if it has none)

eos_token = tokenizer.eos_token or "<|endoftext|>"

# Preprocess function for structured data

def preprocess_function(examples):
    conversations = []

    # Iterate over each dialogue in the dataset
    for dialogue in examples["dialogue"]:
        # Initialize an empty conversation string
        conversation = ""

        # Add each message to the conversation with alternating tags
        for i, message in enumerate(dialogue):
            if i % 2 == 0:  # Even index (user message)
                conversation += f"<|USER|>\n{message}{eos_token}\n"
            else:           # Odd index (bot message)
                conversation += f"<|BOT|>\n{message}{eos_token}\n"

        # Append the fully formatted conversation to the list
        conversations.append(conversation)

    inputs = tokenizer(conversations, truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = inputs["input_ids"].copy()  # Set labels equal to input_ids
    return inputs
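# For illustration, a two-turn dialogue ["Hello", "Hi, how can I help?"] is formatted as
# "<|USER|>\nHello<|endoftext|>\n<|BOT|>\nHi, how can I help?<|endoftext|>\n"
# (assuming the tokenizer's eos token is "<|endoftext|>").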

# Ensure the tokenizer has a pad token; if not, set it to the eos_token

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

train_dataset = train_dataset_raw.map(preprocess_function, batched=True)

# Training arguments

training_args = TrainingArguments(
    output_dir="/home/output/",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir="/home/logs/",
    logging_steps=50,
    report_to="none",  # Disable W&B
)

# Trainer setup

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

#trainer.train(resume_from_checkpoint=True)
trainer.train()

# Save the fine-tuned model and tokenizer

model.save_pretrained("/home/output/")
tokenizer.save_pretrained("/home/output/")

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned model and tokenizer

model_path = "/home/output/"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).cuda()
model.eval()
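# Illustrative check (my assumption about the cause): the reloaded tokenizer should still
# contain the added <|USER|>/<|BOT|> tokens, and len(tokenizer) should equal the resized
# embedding size; token IDs beyond the embedding table trigger this kind of device-side assert.
print(len(tokenizer), model.get_input_embeddings().num_embeddings)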

# Set up conversation history

conversation_history = []

def chatbot_response(user_input):
    # Append the user input to conversation history
    conversation_history.append(f"<|USER|> {user_input}")

    # Prepare the input for the model
    conversation_text = " ".join(conversation_history) + " <|BOT|>"
    inputs = tokenizer(conversation_text, return_tensors="pt").to("cuda")

    # Error happens at this line
    outputs = model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
    )

    # Decode the response and keep only the text after the last <|BOT|> tag
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    bot_response = response_text.split("<|BOT|>")[-1].strip()

    # Append bot response to conversation history
    conversation_history.append(f"<|BOT|> {bot_response}")

    return bot_response
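# For illustration, after one exchange the prompt passed to generate() looks like:
# "<|USER|> Hello <|BOT|> Hi there! <|USER|> How are you? <|BOT|>"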

# Interactive loop

print("JASMINE Chatbot is ready! Type 'exit' to end the conversation.")
while True:
    user_input = input("User: ")
    if user_input.lower() in ["exit", "مع السلامة"]:
        break

    bot_response = chatbot_response(user_input)
    print(f"JASMINE: {bot_response}")
