How to finetune the model

#10
by clause-crahm - opened

How can I fine-tune the model further? Inference works without problems.
I can do fine-tuning with the LoraConfig set to

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["lm_head"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

but trying any other target_modules (or removing that line) leads to the error

RuntimeError: self and mat2 must have the same dtype

The full script that I am using is

from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import argparse

from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import transformers


parser = argparse.ArgumentParser(description='Simple AutoGPTQ example')
parser.add_argument('model_name_or_path', type=str, help='Model folder or repo')
parser.add_argument('--model_basename', type=str, help='Model file basename if model is not named gptq_model-Xb-Ygr')
parser.add_argument('--use_slow', action="store_true", help='Use slow tokenizer')
parser.add_argument('--use_safetensors', action="store_true", help='Load the model weights from a safetensors file')
parser.add_argument('--use_triton', action="store_true", help='Use Triton for inference?')
parser.add_argument('--bits', type=int, default=4, help='Specify GPTQ bits. Only needed if no quantize_config.json is provided')
parser.add_argument('--group_size', type=int, default=128, help='Specify GPTQ group_size. Only needed if no quantize_config.json is provided')
parser.add_argument('--desc_act', action="store_true", help='Specify GPTQ desc_act. Only needed if no quantize_config.json is provided')

args = parser.parse_args()
quantized_model_dir = args.model_name_or_path

tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=not args.use_slow,
                                          unk_token="\<unk\>",
                                          bos_token="\<s\>",
                                          eos_token="\</s\>")

quantize_config = BaseQuantizeConfig.from_pretrained(quantized_model_dir)

model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir,
        use_safetensors=args.use_safetensors,
        model_basename=args.model_basename,
        device="cuda:0",
        use_triton=args.use_triton,
        quantize_config=quantize_config)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)


data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
data = data['train'].train_test_split(train_size=0.9, test_size=0.1)

tokenizer.pad_token = tokenizer.eos_token
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    eval_dataset=data['test'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=3,
        learning_rate=2e-2,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        evaluation_strategy='steps',
        eval_steps=1
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

I don't know, I've not tried fine-tuning yet.

However you could try updating AutoGPTQ to the latest development version (git clone it and build from source), as version 0.3.0 has built-in PEFT support.

I think this will be the intended way to do LoRA training on quantised GPTQ models.

I've not tried it myself yet but I believe it does work.
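For reference, building from source is just a couple of commands (the URL below is the upstream AutoGPTQ repository; pin whatever tag or commit suits your setup):

# build AutoGPTQ from source
# git clone https://github.com/PanQiWei/AutoGPTQ
# cd AutoGPTQ
# pip install .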

Thanks, that helped, I've got it working now. It also required adapting the example from examples/peft/peft_lora_clm_instruction_tuning.py, the essential difference being the use of their GPTQLoraConfig.

Great, glad it worked! Could you share the updated code here, so others could use it as well?

@TheBloke Sure, please see below:

# run with
# python simple_autogptq.py ./text-generation-webui/models/TheBloke_guanaco-33B-GPTQ/ --model_basename Guanaco-33B-GPTQ-4bit.act-order --use_safetensors --use_triton

import os
import argparse
from peft import prepare_model_for_kbit_training, TaskType
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model, BaseQuantizeConfig
from auto_gptq.utils.peft_utils import GPTQLoraConfig

parser = argparse.ArgumentParser(description='Simple AutoGPTQ example')
parser.add_argument('model_name_or_path', type=str, help='Model folder or repo')
parser.add_argument('--model_basename', type=str, help='Model file basename if model is not named gptq_model-Xb-Ygr')
parser.add_argument('--use_slow', action="store_true", help='Use slow tokenizer')
parser.add_argument('--use_safetensors', action="store_true", help='Load the model weights from a safetensors file')
parser.add_argument('--use_triton', action="store_true", help='Use Triton for inference?')
parser.add_argument('--bits', type=int, default=4, help='Specify GPTQ bits. Only needed if no quantize_config.json is provided')
parser.add_argument('--group_size', type=int, default=128, help='Specify GPTQ group_size. Only needed if no quantize_config.json is provided')
parser.add_argument('--desc_act', action="store_true", help='Specify GPTQ desc_act. Only needed if no quantize_config.json is provided')
args = parser.parse_args()

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_name_or_path = args.model_name_or_path
model_basename = args.model_basename
tokenizer_name_or_path = model_name_or_path

peft_config = GPTQLoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
                                          use_fast=not args.use_slow,
                                          unk_token="<unk>",
                                          bos_token="<s>",
                                          eos_token="</s>")
if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id

quantize_config = BaseQuantizeConfig.from_pretrained(model_name_or_path)
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    use_safetensors=args.use_safetensors,
    use_triton=args.use_triton,
    device="cuda:0",
    trainable=True,
    inject_fused_attention=True,
    inject_fused_mlp=False,
    quantize_config=quantize_config
)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()

data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
data = data['train'].train_test_split(train_size=0.9, test_size=0.1)

tokenizer.pad_token = tokenizer.eos_token
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    eval_dataset=data['test'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=3,
        learning_rate=2e-2,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        evaluation_strategy='steps',
        eval_steps=1
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()
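One thing the script above does not do is save the trained adapter afterwards. A minimal addition (the output directory name is an arbitrary choice, not something from this thread):

# after trainer.train(): persist only the LoRA adapter weights
# (writes adapter_config.json and adapter_model.bin to the given directory)
model.save_pretrained("outputs/gptq_lora_adapter")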

Thank you!

Thank you @clause-crahm , if it's possible could you please share the full code on your GitHub (and its link)? I understand if you don't want to share.

Hi @ibibek , the code above is really all there is to it. Just make sure to have up-to-date versions of the packages.
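If in doubt, a quick way to check what is actually installed (this uses only the standard-library importlib.metadata; "auto-gptq" is the PyPI distribution name):

from importlib.metadata import version

for pkg in ("auto-gptq", "peft", "transformers", "datasets"):
    print(pkg, version(pkg))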

@clause-crahm and @TheBloke , I fine-tuned the model using the code you provided above with the "Abirate/english_quotes" dataset, but the loss looks problematic. When running inference with the adapter the results are wrong; it seems the adapter does not work and its parameters were not trained at all.

CUDA_VISIBLE_DEVICES=1 python guanaco_finetuning.py
[2023-07-23 20:48:23,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
The model weights are not tied. Please use the tie_weights method before using the infer_auto_device function.
The safetensors archive passed at guanaco-33B-GPTQ/guanaco-33b-GPTQ-4bit--1g.act.order.safetensors does not contain metadata. Make sure to save your model with the save_pretrained method. Defaulting to 'pt' metadata.
trainable params: 109,117,440 || all params: 4,552,823,296 || trainable%: 2.396698332128724

0%| | 0/3000 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the __call__ method is faster than using a method to encode the text followed by a call to the pad method to get a padded encoding.
{'loss': 0.7326, 'learning_rate': 0.01, 'epoch': 0.0}
{'loss': 0.2599, 'learning_rate': 0.02, 'epoch': 0.0}
{'loss': 1.123, 'learning_rate': 0.01999332888592395, 'epoch': 0.0}
{'loss': 9.3881, 'learning_rate': 0.0199866577718479, 'epoch': 0.0}
{'loss': 16.0878, 'learning_rate': 0.01997998665777185, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.0199733155436958, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.019966644429619748, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.019959973315543694, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.019953302201467644, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.019946631087391597, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.019939959973315543, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.019933288859239492, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019926617745163442, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01991994663108739, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01991327551701134, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01990660440293529, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01989993328885924, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01989326217478319, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01988659106070714, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01987991994663109, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01987324883255504, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019866577718478988, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019859906604402934, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019853235490326884, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019846564376250837, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019839893262174783, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019833222148098732, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019826551034022682, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01981987991994663, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01981320880587058, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01980653769179453, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01979986657771848, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01979319546364243, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019786524349566376, 'epoch': 0.02}
{'loss': 0.0, 'learning_rate': 0.01977985323549033, 'epoch': 0.02}
{'loss': 0.0, 'learning_rate': 0.019773182121414278, 'epoch': 0.02}
{'loss': 0.0, 'learning_rate': 0.019766511007338224, 'epoch': 0.02}
{'loss': 0.0, 'learning_rate': 0.019759839893262174, 'epoch': 0.02}
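One possible culprit, offered as a guess rather than a verified fix: learning_rate=2e-2 is very aggressive for LoRA training in fp16, and a loss that spikes (9.39, 16.09) and then flatlines at exactly 0.0 looks like fp16 overflow followed by the gradient scaler discarding updates. A more conventional configuration to try (2e-4 is a common LoRA default, and warmup_steps=100 is likewise an assumption, not something from this thread):

training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=100,          # longer warmup than the 2 steps used above
    max_steps=3000,
    learning_rate=2e-4,        # far lower than the 2e-2 that diverged
    fp16=True,
    logging_steps=1,
    output_dir="outputs",
    optim="paged_adamw_8bit",
)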

@clause-crahm and @TheBloke , here is my inference code with the adapter; the results are wrong, and it seems the adapter does not work and its parameters were not trained at all. Could you help me check it?

from transformers import AutoTokenizer
from peft import PeftModel, TaskType
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model, BaseQuantizeConfig
from auto_gptq.utils.peft_utils import GPTQLoraConfig

model_name_or_path = "guanaco-33B-GPTQ"
model_basename = "guanaco-33b-GPTQ-4bit--1g.act.order"
checkpoint_path = 'guanaco-33B-GPTQ/gptq_LORA_adapter'

peft_config = GPTQLoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id

quantize_config = BaseQuantizeConfig.from_pretrained(model_name_or_path)
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    use_safetensors=True,
    use_triton=False,
    device="cuda:0",
    trainable=False,
    inject_fused_attention=True,
    inject_fused_mlp=False,
    quantize_config=quantize_config
)

model.gradient_checkpointing_enable()
# attach the trained LoRA adapter from the checkpoint directory
model = get_gptq_peft_model(model, peft_config=peft_config, model_id=checkpoint_path, auto_find_all_linears=True, train_mode=False)
model = PeftModel.from_pretrained(model, model_id=checkpoint_path, adapter_name="adapter_model.bin", is_trainable=False)

prompt = "How can we reduce air pollution?"
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=1, max_new_tokens=512)
print(tokenizer.decode(output[0]))  # decode and show the generated text
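One detail worth double-checking in the snippet above: the adapter is applied twice, first by get_gptq_peft_model(..., model_id=checkpoint_path, ...) and then again by PeftModel.from_pretrained, which stacks a second PEFT wrapper on top of the first. A sketch that loads it only once (assuming checkpoint_path contains the saved adapter_config.json and adapter_model.bin):

# load the base quantized model with from_quantized() exactly as above, then
# attach the trained adapter a single time; get_gptq_peft_model with model_id
# already loads the LoRA weights, so no extra PeftModel.from_pretrained call
model = get_gptq_peft_model(
    model,
    model_id=checkpoint_path,
    train_mode=False,
)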
