Fine-tuned Llama 3 doesn't stop generating tokens when it should.

#142
by keyuisai - opened

Hello, I fine-tuned the Meta-Llama-3-8B-Instruct model. I used the required prompt template and added the special tokens. When I use the fine-tuned model for inference, it generates the right answer but doesn't stop there; instead it keeps generating a bunch of random text. I'm not sure if I did anything wrong here. Here is my fine-tuning code:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

def load_base_model(self):
    # 4-bit NF4 quantization for the base model
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        self.base_model,
        quantization_config=bnb_config,
        device_map=self.device_map,
        attn_implementation='eager')

    self.tokenizer = AutoTokenizer.from_pretrained(self.base_model)
    # model, self.tokenizer = setup_chat_format(model, self.tokenizer)

    config = LoraConfig(
        r=self.lora_r,
        lora_alpha=self.lora_alpha,
        target_modules=self.target_modules,
        lora_dropout=self.lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)
    print_parameter_status(model)

    return model, self.tokenizer

import transformers
from trl import SFTTrainer

def start_finetune(self, model, train_data, val_data):
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=transformers.TrainingArguments(
            resume_from_checkpoint=None,
            per_device_train_batch_size=self.params.get('micro_batch_size', 10),
            gradient_accumulation_steps=self.params.get('gradient_accumulation_steps', 10),
            warmup_steps=self.params.get('warmup_steps', 5),
            num_train_epochs=self.params.get('num_epochs', 3),
            learning_rate=self.params.get('learning_rate', 2e-4),
            fp16=self.params.get('fp16', False),
            logging_steps=self.params.get('logging_steps', 5),
            optim=self.params.get('optim', 'adamw_torch'),
            lr_scheduler_type=self.params.get('lr_scheduler_type', 'cosine'),
            evaluation_strategy="steps",
            save_strategy="steps",
            eval_steps=5,
            save_steps=self.params.get('save_steps', 5),
            output_dir=self.params.get('cp_path'),
            save_total_limit=50,
            load_best_model_at_end=True,
            ddp_find_unused_parameters=None,
            group_by_length=self.params.get('group_by_length', False),
            report_to='tensorboard',
            run_name=None,
        ),
        peft_config=LoraConfig(
            r=self.lora_r,
            lora_alpha=self.lora_alpha,
            target_modules=self.target_modules,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        ),
        dataset_text_field='full_prompt',
        max_seq_length=256,
        packing=False,
    )
    model.config.use_cache = False
    trainer.train()

from datasets import load_dataset

train_js = load_dataset("json", data_files='interim_dataset/train_test_clockwork_v4/train_df.jsonl', split="all")
val_js = load_dataset("json", data_files='interim_dataset/train_test_clockwork_v4/val_df.jsonl', split="all")

def format_chat_template(row):
    # Llama 3 chat template: system / user / assistant turns, each ending with <|eot_id|>
    template = f"""<|start_header_id|>system<|end_header_id|> {row['instruction']}<|eot_id|><|start_header_id|>user<|end_header_id|> {row['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|> {row['output']}<|eot_id|>"""
    row["full_prompt"] = template
    return row

train_data = train_js.map(format_chat_template)
val_data = val_js.map(format_chat_template)
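
As an illustration only (the field values below are made up; the real rows come from the JSONL files above), full_prompt ends up as a single Llama 3-style string:

# Illustrative row, not from the actual dataset
example_row = {
    "instruction": "Decide whether the transaction note violates a BRM category.",
    "input": "Donation to a local charity.",
    "output": "dismissal",
}
print(format_chat_template(example_row)["full_prompt"])
# <|start_header_id|>system<|end_header_id|> Decide whether ...<|eot_id|><|start_header_id|>user<|end_header_id|> Donation ...<|eot_id|><|start_header_id|>assistant<|end_header_id|> dismissal<|eot_id|>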

Inference:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

def merge_model(base_model, new_model):
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    # Attach the LoRA adapter and merge it into the base weights
    model = PeftModel.from_pretrained(base_model_reload, new_model)
    model = model.merge_and_unload()
    return model, tokenizer

model,tokenizer = merge_model(base_model_path,new_model_path)

def format_chat_template_inf(row):
    # Same template as training, but without the assistant answer so the model completes it
    template = f"""<|start_header_id|>system<|end_header_id|> {row['instruction'].values[0]}<|eot_id|><|start_header_id|>user<|end_header_id|> {row['input'].values[0]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
    return template

def inference(prompt, tokenizer, model):
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True).to("cuda")
    outputs = model.generate(**inputs, max_length=512, num_return_sequences=1)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the assistant part of the decoded conversation
    return text.split("assistant")[1]

inference(prompt,tokenizer,model)

result: dismissal/nondiscrimination/child abuse/CSAM. RuntimeObject No this transaction note does not indicate a violation of the potential BRM category. RuntimeObject It is a donation note, which is not a BRM violation.<|eot_id|>'

("dismissal" is the expected output; everything after it is the extra text the model keeps generating.)
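
For reference, here is a quick way to check which stop tokens the model is actually configured with (a minimal diagnostic sketch; model and tokenizer are the merged model and tokenizer from the inference code above):

# Which token does the tokenizer treat as end-of-sequence?
print(tokenizer.eos_token, tokenizer.eos_token_id)
# Which token ids will generate() stop on?
print(model.generation_config.eos_token_id)
# Id of the <|eot_id|> turn terminator used in the prompt template
print(tokenizer.convert_tokens_to_ids("<|eot_id|>"))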

Can anyone share some insights? I appreciate it.

I think you should set pad_token_id to tokenizer.eos_token_id:
generation_config = model.generation_config
generation_config.pad_token_id = tokenizer.eos_token_id
outputs = model.generate(**inputs, max_length=512, num_return_sequences=1, generation_config=generation_config)


Thank you! It's included in the config, but I'm still having this issue.
(screenshot attached)

Hi,

There was a bug in Llama 3 that has since been fixed (on May 13): https://github.com/unslothai/unsloth/issues/416#issuecomment-2143916779. However, if you're using a Llama 3 checkpoint on Hugging Face that isn't Meta's official one, that copy might not include the fix. You could try Meta's own repo on Hugging Face directly.

If not, here is a workaround: https://github.com/unslothai/unsloth/issues/416#issuecomment-2094745798.
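
Not the exact workaround from that issue, but a minimal sketch of the same idea (using the model, tokenizer, and inputs from the inference code above): tell generate() to stop on <|eot_id|>, the turn terminator used in the training template, in addition to the tokenizer's default eos token.

# Stop on both the default eos token and the Llama 3 turn terminator
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
outputs = model.generate(
    **inputs,
    max_length=512,
    num_return_sequences=1,
    eos_token_id=terminators,
    pad_token_id=tokenizer.eos_token_id,
)

If only the default <|end_of_text|> id is in eos_token_id, generation keeps going past the answer, which matches the behavior shown above.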

The workaround doesn't mention this, but after you merge the weights and want to run inference, you need to switch the tokenizer's padding side back to "left" and set the padding token back to the eos token after loading the fine-tuned model and the saved tokenizer.
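
A minimal sketch of that step (assuming tokenizer is the one saved with the fine-tuned model):

# After loading the fine-tuned model and its tokenizer for inference
tokenizer.padding_side = "left"            # generation expects left padding
tokenizer.pad_token = tokenizer.eos_token  # pad with the eos token again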
