problem after training the model

#35
by ivgome - opened

We have managed to launch the training script by providing our own dataset, following this guide.
However, while we can launch the model in chatbot format before training, we are unable to launch it once it has been trained because the RAM consumption skyrockets. Can we modify any parameter at the configuration level to solve this problem?
We are currently following these steps, in colab free.
https://colab.research.google.com/drive/1n5U13L0Bzhs32QO_bls5jwuZR62GPSwE?usp=sharing#scrollTo=zlw7IxfUED0a

Databricks org

It's not clear what you're doing or what hardware you're on, but it just sounds like you don't have enough memory to load the model — you need to load it on a GPU.

srowen changed discussion status to closed

Hi @ivgome, your code runs well in Colab free with a few modifications — mostly due to memory limitations (see the copy-and-paste of your file with adjustments below, which is running):

# -*- coding: utf-8 -*-

"""Fine-tuning Dolly 2.0 with LoRA and Alpaca.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1-nyF2tdV7jOvxqR3OCw7Bv2IyZQpp_5D

Fine-tuning Dolly 2.0 with LoRA

!git clone https://github.com/gururise/AlpacaDataCleaned.git

ls AlpacaDataCleaned/

!pip install accelerate>=0.21.0 transformers[torch]==4.30.2
!pip install -q datasets loralib sentencepiece
!pip -q install git+https://github.com/huggingface/peft.git
!pip -q install bitsandbytes

Create Instruct Pipeline

import logging
import re

import numpy as np
from transformers import Pipeline, PreTrainedTokenizer

# FIX: was logging.getLogger(name) -- the markdown paste stripped the dunder
# underscores from __name__, which would raise NameError at import time.
logger = logging.getLogger(__name__)

# Marker strings the Dolly training format uses to delimit the instruction
# and response sections of a prompt; the tokenizer registers these as
# special tokens during training.
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
INTRO_BLURB = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
)

# This is the prompt that is used for generating responses using an already
# trained model.  It ends with the response key, where the job of the model is
# to provide the completion that follows it (i.e. the response itself).

# Prompt used at inference time with an already-trained model.  It ends with
# the response key: the model's job is to supply the completion that follows
# it (i.e. the response itself).  The {instruction} placeholder is left
# unresolved here and is filled in per request by preprocess().
PROMPT_FOR_GENERATION_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        RESPONSE_KEY,
        "",  # trailing newline, matching the original triple-quoted template
    ]
)

def get_special_token_id(tokenizer: "PreTrainedTokenizer", key: str) -> int:
    """Gets the token ID for a given string that has been added to the tokenizer as a special token.

    When training, we configure the tokenizer so that sequences like "### Instruction:" and "### End" are
    treated specially and converted to a single, new token.  This retrieves the token ID each of these keys
    maps to.

    Args:
        tokenizer (PreTrainedTokenizer): the tokenizer
        key (str): the key to convert to a single token

    Raises:
        ValueError: if more than one token ID was generated for the key.
            (FIX: the docstring previously claimed RuntimeError, but the code raises ValueError.)

    Returns:
        int: the token ID for the given key
    """
    token_ids = tokenizer.encode(key)
    if len(token_ids) > 1:
        raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
    return token_ids[0]

class InstructionTextGenerationPipeline(Pipeline):
    """Text-generation pipeline for instruction-tuned (Dolly-style) models.

    Formats the user instruction into the training prompt layout, generates
    with the wrapped model, and extracts only the response portion of the
    model output.
    """

    # FIX: the markdown paste stripped the dunder underscores; this must be
    # __init__ / super().__init__ to be valid Python.
    def __init__(
        self, *args, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs
    ):
        # Generation defaults are forwarded to Pipeline and surface again in
        # _sanitize_parameters as **generate_kwargs.
        super().__init__(*args, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)

    def _sanitize_parameters(self, return_instruction_text=False, **generate_kwargs):
        preprocess_params = {}

        # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
        # append a newline to yield a single token.  find whatever token is configured for the response key.
        tokenizer_response_key = next(
            (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None
        )

        response_key_token_id = None
        end_key_token_id = None
        if tokenizer_response_key:
            try:
                response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
                end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)

                # Ensure generation stops once it generates "### End"
                generate_kwargs["eos_token_id"] = end_key_token_id
            except ValueError:
                # Keys did not map to single tokens; postprocess() will fall
                # back to the regex path below.
                pass

        forward_params = generate_kwargs
        postprocess_params = {
            "response_key_token_id": response_key_token_id,
            "end_key_token_id": end_key_token_id,
            "return_instruction_text": return_instruction_text,
        }

        return preprocess_params, forward_params, postprocess_params

    def preprocess(self, instruction_text, **generate_kwargs):
        # Wrap the raw instruction in the same prompt layout used at training time.
        prompt_text = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction_text)
        inputs = self.tokenizer(
            prompt_text,
            return_tensors="pt",
        )
        inputs["prompt_text"] = prompt_text
        inputs["instruction_text"] = instruction_text
        return inputs

    def _forward(self, model_inputs, **generate_kwargs):
        input_ids = model_inputs["input_ids"]
        attention_mask = model_inputs.get("attention_mask", None)
        if attention_mask is not None:
            # FIX: move the mask to the model's device alongside input_ids;
            # otherwise generate() can fail with a device mismatch on GPU.
            attention_mask = attention_mask.to(self.model.device)
        generated_sequence = self.model.generate(
            input_ids=input_ids.to(self.model.device),
            attention_mask=attention_mask,
            pad_token_id=self.tokenizer.pad_token_id,
            **generate_kwargs,
        )[0].cpu()
        instruction_text = model_inputs.pop("instruction_text")
        return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}

    def postprocess(self, model_outputs, response_key_token_id, end_key_token_id, return_instruction_text):
        sequence = model_outputs["generated_sequence"]
        instruction_text = model_outputs["instruction_text"]

        # The response will be set to this variable if we can identify it.
        decoded = None

        # If we have token IDs for the response and end, then we can find the tokens and only decode between them.
        # FIX: compare against None rather than truthiness -- a valid token ID
        # of 0 would otherwise be treated as "missing".
        if response_key_token_id is not None and end_key_token_id is not None:
            # Find where "### Response:" is first found in the generated tokens.  Considering this is part of the
            # prompt, we should definitely find it.  We will return the tokens found after this token.
            response_pos = None
            response_positions = np.where(sequence == response_key_token_id)[0]
            if len(response_positions) == 0:
                # FIX: logger.warn is a deprecated alias of logger.warning.
                logger.warning(f"Could not find response key {response_key_token_id} in: {sequence}")
            else:
                response_pos = response_positions[0]

            # FIX: position 0 is falsy but valid; test for None explicitly.
            if response_pos is not None:
                # Next find where "### End" is located.  The model has been trained to end its responses with this
                # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
                # this token, as the response could be truncated.  If we don't find it then just return everything
                # to the end.  Note that even though we set eos_token_id, we still see the this token at the end.
                end_pos = None
                end_positions = np.where(sequence == end_key_token_id)[0]
                if len(end_positions) > 0:
                    end_pos = end_positions[0]

                decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()
        else:
            # Otherwise we'll decode everything and use a regex to find the response and end.

            fully_decoded = self.tokenizer.decode(sequence)

            # The response appears after "### Response:".  The model has been trained to append "### End" at the
            # end.
            m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)

            if m:
                decoded = m.group(1).strip()
            else:
                # The model might not generate the "### End" sequence before reaching the max tokens.  In this case,
                # return everything after "### Response:".
                m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
                if m:
                    decoded = m.group(1).strip()
                else:
                    logger.warning(f"Failed to find response in:\n{fully_decoded}")

        if return_instruction_text:
            return {"instruction_text": instruction_text, "generated_text": decoded}

        return decoded

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Left padding for decoder-only generation with the pipeline above.
tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b", padding_side="left")

# NOTE(review): this config enables int8 fp32 CPU offload, but the model below
# is loaded with load_in_4bit=True -- mixing the two looks contradictory;
# confirm which quantization mode is actually intended.
quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)

# Load Dolly 2.0 (3B) quantized to 4-bit, with device placement chosen
# automatically ("auto") to fit Colab free-tier memory.
model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b",
device_map="auto",
torch_dtype=torch.bfloat16,
#torch_dtype=torch.int8,
quantization_config=quantization_config,
load_in_4bit=True,
#load_in_8bit=True,
)

# Pre-training sanity pipeline around the base (untuned) model.
generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)

from datasets import load_dataset

# Load the raw Alpaca dataset cloned above.  NOTE(review): the cleaned file
# (alpaca_data_cleaned.json) is loaded again in the finetuning section and
# replaces this -- presumably only one of the two loads is needed.
data = load_dataset("json",
data_files="./AlpacaDataCleaned/alpaca_data.json")

def generate_prompt(data_point):
    """Build an Alpaca-style training prompt from one dataset record.

    Expects a mapping with "instruction", "input" and "output" keys.
    Adapted from https://github.com/tloen/alpaca-lora

    The "### " section headers (stripped by the markdown paste) are restored
    so the training prompt matches INSTRUCTION_KEY / RESPONSE_KEY used at
    inference time.
    """
    # FIX: branch on the optional *input* field.  The pasted version checked
    # "instruction", but every record has an instruction -- the two templates
    # differ only in whether an Input section is present.
    if data_point["input"]:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""

# Tokenize each record's full prompt into a "prompt" column.
# NOTE(review): this result appears to be discarded -- `data` is re-loaded and
# re-mapped in the finetuning section below; verify this cell is still needed.
data = data.map(lambda data_point: {"prompt": tokenizer(generate_prompt(data_point))})

# Notebook cell echo: displays the mapped dataset.
data

"""## Finetuning Dolly"""

import os

# Restrict training to the first GPU (Colab exposes a single GPU anyway).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, GPTJForCausalLM

# NOTE(review): prepare_model_for_int8_training was renamed to
# prepare_model_for_kbit_training in newer peft releases -- this import pins
# the notebook to older peft versions; confirm against the installed version.
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model

Settings for A100 - For 3090

# NOTE(review): every value in this cell is immediately re-assigned by the
# next settings cell, so none of these take effect.
MICRO_BATCH_SIZE = 4 # change to 4 for 3090
BATCH_SIZE = 128
# Accumulate gradients so the effective batch size is BATCH_SIZE.
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 2 # paper uses 3
LEARNING_RATE = 2e-5
CUTOFF_LEN = 256
LORA_R = 4
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

Settings for A100 - For 3090

# Effective hyperparameters -- this cell overrides the identically named
# values assigned in the cell above.
MICRO_BATCH_SIZE = 4  # per-device batch; 4 fits a 3090 / Colab free GPU
BATCH_SIZE = 32  # effective batch size reached via gradient accumulation
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 2  # the paper uses 3
LEARNING_RATE = 2e-5
CUTOFF_LEN = 32  # short token context to fit free-tier memory
LORA_R = 4
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

# Int8 training preparation is disabled; the model was loaded in 4-bit above.
#model = prepare_model_for_int8_training(model, use_gradient_checkpointing=True)

# LoRA adapter configuration.  No target_modules are given, so peft's
# per-architecture defaults apply -- TODO confirm they cover GPT-NeoX
# (Dolly's base architecture) in the installed peft version.
config = LoraConfig(
r=LORA_R,
lora_alpha=LORA_ALPHA,
lora_dropout=LORA_DROPOUT,
bias="none",
task_type="CAUSAL_LM",
)
# Wrap the base model so only the LoRA adapter weights are trainable.
model = get_peft_model(model, config)
tokenizer.pad_token_id = 0 # unk. we want this to be different from the eos token

# Re-load the *cleaned* Alpaca data for finetuning (replaces the earlier load).
data = load_dataset("json", data_files="./AlpacaDataCleaned/alpaca_data_cleaned.json")

# Shuffle, then tokenize each full prompt to a fixed CUTOFF_LEN tokens
# (truncated and padded) so every batch has a uniform shape.
data = data.shuffle().map(
lambda data_point: tokenizer(
generate_prompt(data_point),
truncation=True,
max_length=CUTOFF_LEN,
padding="max_length",
)
)

# Notebook cell echo: displays the processed dataset.
data

from transformers.training_args import ParallelMode

# Standard HF Trainer over the tokenized "train" split; the commented-out
# arguments are leftovers from distributed/sharded experiments.
trainer = transformers.Trainer(
model=model,
train_dataset=data["train"],
args=transformers.TrainingArguments(
per_device_train_batch_size=MICRO_BATCH_SIZE,
gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
warmup_steps=100,
num_train_epochs=EPOCHS,
learning_rate=LEARNING_RATE,
# NOTE(review): fp16=True while the base weights were loaded as bfloat16 /
# 4-bit above -- confirm the mixed-precision mode matches the loaded dtype.
fp16=True,
#sharded_ddp="zero_dp_3 auto_wrap",
# fsdp="full_shard auto_wrap",
# model_parallel=True,
#parallel_mode=ParallelMode.DISTRIBUTED,
#is_model_parallel=True,
logging_steps=1,
output_dir="lora-dolly",
save_total_limit=3,
) ,
# Causal-LM collator: labels are the inputs (shifted), no masked-LM objective.
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# Disable the KV cache during training -- presumably to save memory / avoid
# the gradient-checkpointing incompatibility.  NOTE(review): consider setting
# use_cache=True again before inference for faster generation.
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)

# Since model is peft-wrapped (get_peft_model above), this presumably saves
# only the LoRA adapter weights, not the full base model -- TODO confirm the
# matching re-load path for inference.
model.save_pretrained("alpaca-lora-dolly-2.0")

# Rebuild the pipeline around the finetuned (adapter-augmented) model.
generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)

# Smoke-test prompts against the finetuned model.
generate_text("Look up the boiling point of water.")

generate_text("Find the capital of Spain.")

generate_text("Translate the following phrase into French: I love my dog")

generate_text("Given a set of numbers, find the maximum value: Set: {10, 3, 25, 62, 16}")

# NOTE(review): duplicate of the earlier translation prompt.
generate_text("Translate the following phrase into French: I love my dog")

Sign up or log in to comment