Fine-tuned Llama 3 doesn't stop generating tokens when it should.

#142
by keyuisai - opened

Hello, I fine-tuned the Meta-Llama-3-8B-Instruct model. I used the required prompt template and added the special tokens. When I use the fine-tuned model for inference, it generates the right answer but doesn't stop there; instead it keeps generating a bunch of random text. I'm not sure if I did anything wrong here. Here is my fine-tuning code:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

def load_base_model(self):
    # 4-bit NF4 quantization for the base model
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        self.base_model,
        quantization_config=bnb_config,
        device_map=self.device_map,
        attn_implementation='eager')

    self.tokenizer = AutoTokenizer.from_pretrained(self.base_model)
    # model, self.tokenizer = setup_chat_format(model, self.tokenizer)

    config = LoraConfig(
        r=self.lora_r,
        lora_alpha=self.lora_alpha,
        target_modules=self.target_modules,
        lora_dropout=self.lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)
    print_parameter_status(model)

    return model, self.tokenizer

import transformers
from trl import SFTTrainer

def start_finetune(self, model, train_data, val_data):
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=transformers.TrainingArguments(
            resume_from_checkpoint=None,
            per_device_train_batch_size=self.params.get('micro_batch_size', 10),
            gradient_accumulation_steps=self.params.get('gradient_accumulation_steps', 10),
            warmup_steps=self.params.get('warmup_steps', 5),
            num_train_epochs=self.params.get('num_epochs', 3),
            learning_rate=self.params.get('learning_rate', 2e-4),
            fp16=self.params.get('fp16', False),
            logging_steps=self.params.get('logging_steps', 5),
            optim=self.params.get('optim', 'adamw_torch'),
            lr_scheduler_type=self.params.get('lr_scheduler_type', 'cosine'),
            evaluation_strategy="steps",
            save_strategy="steps",
            eval_steps=5,
            save_steps=self.params.get('save_steps', 5),
            output_dir=self.params.get('cp_path'),
            save_total_limit=50,
            load_best_model_at_end=True,
            ddp_find_unused_parameters=None,
            group_by_length=self.params.get('group_by_length', False),
            report_to='tensorboard',
            run_name=None,
        ),
        peft_config=LoraConfig(
            r=self.lora_r,
            lora_alpha=self.lora_alpha,
            target_modules=self.target_modules,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        ),
        dataset_text_field='full_prompt',
        max_seq_length=256,
        packing=False,
    )
    model.config.use_cache = False
    trainer.train()

from datasets import load_dataset

train_js = load_dataset("json", data_files='interim_dataset/train_test_clockwork_v4/train_df.jsonl', split="all")
val_js = load_dataset("json", data_files='interim_dataset/train_test_clockwork_v4/val_df.jsonl', split="all")

def format_chat_template(row):
    # Llama 3 chat template: system / user / assistant turns, each ending with <|eot_id|>
    template = f"""<|start_header_id|>system<|end_header_id|> {row['instruction']}<|eot_id|><|start_header_id|>user<|end_header_id|> {row['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|> {row['output']}<|eot_id|>"""
    row["full_prompt"] = template
    return row

train_data = train_js.map(format_chat_template)
val_data = val_js.map(format_chat_template)
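
As an illustration only (the field values below are made up; the real rows come from the JSONL files above), full_prompt ends up as a single Llama 3-style string:

# Illustrative row, not from the actual dataset
example_row = {
    "instruction": "Decide whether the transaction note violates a BRM category.",
    "input": "Donation to a local charity.",
    "output": "dismissal",
}
print(format_chat_template(example_row)["full_prompt"])
# <|start_header_id|>system<|end_header_id|> Decide whether ...<|eot_id|><|start_header_id|>user<|end_header_id|> Donation ...<|eot_id|><|start_header_id|>assistant<|end_header_id|> dismissal<|eot_id|>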

Inference:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

def merge_model(base_model, new_model):
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    # Attach the LoRA adapter and merge it into the base weights
    model = PeftModel.from_pretrained(base_model_reload, new_model)
    model = model.merge_and_unload()
    return model, tokenizer

model,tokenizer = merge_model(base_model_path,new_model_path)

def format_chat_template_inf(row):
    # Same template as training, but without the assistant answer so the model completes it
    template = f"""<|start_header_id|>system<|end_header_id|> {row['instruction'].values[0]}<|eot_id|><|start_header_id|>user<|end_header_id|> {row['input'].values[0]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
    return template

def inference(prompt, tokenizer, model):
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True).to("cuda")
    outputs = model.generate(**inputs, max_length=512, num_return_sequences=1)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the assistant part of the decoded conversation
    return text.split("assistant")[1]

inference(prompt,tokenizer,model)

result: dismissal/nondiscrimination/child abuse/CSAM. RuntimeObject No this transaction note does not indicate a violation of the potential BRM category. RuntimeObject It is a donation note, which is not a BRM violation.<|eot_id|>'

("dismissal" is the expected output; everything after it is the extra text the model keeps generating.)
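
For reference, here is a quick way to check which stop tokens the model is actually configured with (a minimal diagnostic sketch; model and tokenizer are the merged model and tokenizer from the inference code above):

# Which token does the tokenizer treat as end-of-sequence?
print(tokenizer.eos_token, tokenizer.eos_token_id)
# Which token ids will generate() stop on?
print(model.generation_config.eos_token_id)
# Id of the <|eot_id|> turn terminator used in the prompt template
print(tokenizer.convert_tokens_to_ids("<|eot_id|>"))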

Can anyone share some insights? I appreciate it.

I think you should set pad_token_id to tokenizer.eos_token_id:
generation_config = model.generation_config
generation_config.pad_token_id = tokenizer.eos_token_id
outputs = model.generate(**inputs, max_length=512, num_return_sequences=1, generation_config=generation_config)


Thank you! It's included in the config, but I'm still having this issue.
(screenshot attached)

Hi,

There was a bug in Llama 3 that has since been fixed (on May 13): https://github.com/unslothai/unsloth/issues/416#issuecomment-2143916779. However, if you're using a Llama 3 checkpoint on Hugging Face that isn't Meta's official one, that copy might not include the fix. You could try Meta's own repo on Hugging Face directly.

If not, here is a workaround: https://github.com/unslothai/unsloth/issues/416#issuecomment-2094745798.
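
Not the exact workaround from that issue, but a minimal sketch of the same idea (using the model, tokenizer, and inputs from the inference code above): tell generate() to stop on <|eot_id|>, the turn terminator used in the training template, in addition to the tokenizer's default eos token.

# Stop on both the default eos token and the Llama 3 turn terminator
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
outputs = model.generate(
    **inputs,
    max_length=512,
    num_return_sequences=1,
    eos_token_id=terminators,
    pad_token_id=tokenizer.eos_token_id,
)

If only the default <|end_of_text|> id is in eos_token_id, generation keeps going past the answer, which matches the behavior shown above.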

The workaround doesn't mention this, but after you merge the weights and want to run inference, you need to switch the tokenizer's padding side back to "left" and set the padding token back to the eos token after loading the fine-tuned model and the saved tokenizer.
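
A minimal sketch of that step (assuming tokenizer is the one saved with the fine-tuned model):

# After loading the fine-tuned model and its tokenizer for inference
tokenizer.padding_side = "left"            # generation expects left padding
tokenizer.pad_token = tokenizer.eos_token  # pad with the eos token again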
