Fine-tuned Llama 3 doesn't stop generating tokens when it should.

#142
by keyuisai - opened

Hello, I fine-tuned the Meta-Llama-3-8B-Instruct model. I used the required prompt template and added the special tokens. When I use the fine-tuned model for inference, it generates the right answer, but it doesn't stop there; instead, it continues to generate a bunch of random text. I'm not sure if I did anything wrong here.

# Imports used across the snippets below
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
from datasets import load_dataset

def load_base_model(self):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        self.base_model,
        quantization_config=bnb_config,
        device_map=self.device_map,
        attn_implementation='eager',
    )

    self.tokenizer = AutoTokenizer.from_pretrained(self.base_model)
    # model, self.tokenizer = setup_chat_format(model, self.tokenizer)

    config = LoraConfig(
        r=self.lora_r,
        lora_alpha=self.lora_alpha,
        target_modules=self.target_modules,
        lora_dropout=self.lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)
    print_parameter_status(model)

    return model, self.tokenizer

def start_finetune(self, model, train_data, val_data):
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=transformers.TrainingArguments(
            resume_from_checkpoint=None,
            per_device_train_batch_size=self.params.get('micro_batch_size', 10),
            gradient_accumulation_steps=self.params.get('gradient_accumulation_steps', 10),
            warmup_steps=self.params.get('warmup_steps', 5),
            num_train_epochs=self.params.get('num_epochs', 3),
            learning_rate=self.params.get('learning_rate', 2e-4),
            fp16=self.params.get('fp16', False),
            logging_steps=self.params.get('logging_steps', 5),
            optim=self.params.get('optim', 'adamw_torch'),
            lr_scheduler_type=self.params.get('lr_scheduler_type', 'cosine'),
            evaluation_strategy="steps",
            save_strategy="steps",
            eval_steps=5,
            save_steps=self.params.get('save_steps', 5),
            output_dir=self.params.get('cp_path'),
            save_total_limit=50,
            load_best_model_at_end=True,
            ddp_find_unused_parameters=None,
            group_by_length=self.params.get('group_by_length', False),
            report_to='tensorboard',
            run_name=None,
        ),
        peft_config=LoraConfig(
            r=self.lora_r,
            lora_alpha=self.lora_alpha,
            target_modules=self.target_modules,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        ),
        dataset_text_field='full_prompt',
        max_seq_length=256,
        packing=False,
    )
    model.config.use_cache = False
    trainer.train()

train_js = load_dataset("json", data_files = f'interim_dataset/train_test_clockwork_v4/train_df.jsonl',split="all")
val_js = load_dataset("json", data_files = f'interim_dataset/train_test_clockwork_v4/val_df.jsonl',split="all")

def format_chat_template(row):
    template = (
        f"<|start_header_id|>system<|end_header_id|> {row['instruction']}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|> {row['input']}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|> {row['output']}<|eot_id|>"
    )
    row["full_prompt"] = template
    return row

train_data = train_js.map(format_chat_template)
val_data = val_js.map(format_chat_template)
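For comparison, here is a sketch (not part of the original code) of a hypothetical helper, format_chat_template_v2, that builds the same full_prompt column with the tokenizer's built-in chat template. For Meta-Llama-3-8B-Instruct this also prepends <|begin_of_text|> and uses the official newlines after each header, so the <|eot_id|> the model is trained to emit lands exactly where the inference-time template expects it. The column names (instruction, input, output) follow the original; the tokenizer argument is assumed to be the one returned by load_base_model.

def format_chat_template_v2(row, tokenizer):
    # Same three turns as above, but formatted by the tokenizer's own chat
    # template instead of a hand-written f-string.
    messages = [
        {"role": "system", "content": row["instruction"]},
        {"role": "user", "content": row["input"]},
        {"role": "assistant", "content": row["output"]},
    ]
    # tokenize=False returns the formatted string, ending the assistant turn with <|eot_id|>.
    row["full_prompt"] = tokenizer.apply_chat_template(messages, tokenize=False)
    return row

# e.g. train_data = train_js.map(lambda r: format_chat_template_v2(r, tokenizer))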

Inference:
def merge_model(base_model, new_model):
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    model = PeftModel.from_pretrained(base_model_reload, new_model)
    model = model.merge_and_unload()
    return model, tokenizer

model, tokenizer = merge_model(base_model_path, new_model_path)

def format_chat_template_inf(row):
    template = (
        f"<|start_header_id|>system<|end_header_id|> {row['instruction'].values[0]}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|> {row['input'].values[0]}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>"
    )
    return template

def inference(prompt, tokenizer, model):
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True).to("cuda")
    outputs = model.generate(**inputs, max_length=512, num_return_sequences=1)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text.split("assistant")[1]
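Since skip_special_tokens=True strips the header tokens, splitting the decoded string on the word "assistant" can break if that word appears in the answer. A hypothetical variant, inference_v2 below, is only a sketch (assuming the same prompt, tokenizer, and model as above) that decodes just the newly generated tokens instead:

def inference_v2(prompt, tokenizer, model):
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=256, num_return_sequences=1)
    # Drop the prompt tokens and decode only what the model generated.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)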

inference(prompt, tokenizer, model)

result: dismissal/nondiscrimination/child abuse/CSAM. RuntimeObject No this transaction note does not indicate a violation of the potential BRM category. RuntimeObject It is a donation note, which is not a BRM violation.<|eot_id|>'

("dismissal" is the expected output.)

Can anyone share some insights? I appreciate it.

I think you should set pad_token_id to tokenizer.eos_token_id:
generation_config = model.generation_config
generation_config.pad_token_id = tokenizer.eos_token_id
outputs = model.generate(**inputs, max_length=512, num_return_sequences=1, generation_config=generation_config)
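On Llama-3-Instruct checkpoints the assistant turn ends with <|eot_id|> rather than the plain EOS token, so it may also be worth passing that token id as an explicit stopping criterion. The snippet below is only a sketch of the generation call (inputs, tokenizer, and model are assumed to be the same objects used in inference above), not something confirmed to fix the issue in this thread:

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
outputs = model.generate(
    **inputs,
    max_length=512,
    num_return_sequences=1,
    eos_token_id=terminators,              # stop on <|eot_id|> as well as the default EOS
    pad_token_id=tokenizer.eos_token_id,
)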


Thank you! It's included in the config, but I'm still having this issue.
(Screenshot attached: Screenshot 2024-06-16 at 10.07.01 PM.png)
