simple-llm-finetuner

Build error

File size: 16,155 Bytes

import os
import argparse
import torch
import gradio as gr
import transformers

from datasets import Dataset
from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, PeftModel

model = None
tokenizer = None
peft_model = None

def maybe_load_models():
    global model
    global tokenizer

    if model is None:
        model = LlamaForCausalLM.from_pretrained(
            "decapoda-research/llama-7b-hf",
            load_in_8bit=True,
            torch_dtype=torch.float16,
            device_map="auto",
        )

    if tokenizer is None:
        tokenizer = LlamaTokenizer.from_pretrained(
            "decapoda-research/llama-7b-hf",
        )

    return model, tokenizer

def reset_models():
    global model
    global tokenizer

    del model
    del tokenizer

    model = None
    tokenizer = None

def generate_text(
    model_name, 
    text, 
    temperature, 
    top_p, 
    top_k, 
    repeat_penalty,
    max_new_tokens,
    progress=gr.Progress(track_tqdm=True)
):
    model, tokenizer = maybe_load_models()

    if model_name and model_name != "None":
        model = PeftModel.from_pretrained(
            model, model_name,
            torch_dtype=torch.float16
        )

    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"].to(model.device)
    generation_config = GenerationConfig(
        # Controls the 'temperature' of the softmax distribution during sampling.
        # Higher values (e.g., 1.0) make the model generate more diverse and random outputs, 
        # while lower values (e.g., 0.1) make it more deterministic and 
        # focused on the highest probability tokens.
        temperature=temperature,  

        # Sets the nucleus sampling threshold. In nucleus sampling, 
        # only the tokens whose cumulative probability exceeds 'top_p' are considered 
        # for sampling. This technique helps to reduce the number of low probability 
        # tokens considered during sampling, which can lead to more diverse and coherent outputs.
        top_p=top_p,  

        # Sets the number of top tokens to consider during sampling. 
        # In top-k sampling, only the 'top_k' tokens with the highest probabilities 
        # are considered for sampling. This method can lead to more focused and coherent 
        # outputs by reducing the impact of low probability tokens.
        top_k=top_k,  

        # Applies a penalty to the probability of tokens that have already been generated, 
        # discouraging the model from repeating the same words or phrases. The penalty is
        # applied by dividing the token probability by a factor based on the number of times 
        # the token has appeared in the generated text.
        repeat_penalty=repeat_penalty,

        # Limits the maximum number of tokens generated in a single iteration. 
        # This can be useful to control the length of generated text, especially in tasks 
        # like text summarization or translation, where the output should not be excessively long.
        max_new_tokens=max_new_tokens,  
    )


    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
        )

    output = []
    for token_id in generation_output[0]:
        new = tokenizer.decode(token_id, skip_special_tokens=True)
        output.append(new)
        print(new, end=" ", flush=True)

    return ''.join(output).strip()

def tokenize_and_train(
    training_text,
    max_seq_length,
    micro_batch_size,
    gradient_accumulation_steps,
    epochs,
    learning_rate,
    lora_r,
    lora_alpha,
    lora_dropout,
    model_name,
    progress=gr.Progress(track_tqdm=True)
):
    model, tokenizer = maybe_load_models()

    tokenizer.pad_token_id = 0

    paragraphs = training_text.split("\n\n\n")
    print("Number of samples: " + str(len(paragraphs)))
        
    def tokenize(item):
        result = tokenizer(
            item["text"],
            truncation=True,
            max_length=max_seq_length,
            padding="max_length",
        )
        return {
            "input_ids": result["input_ids"][:-1],
            "attention_mask": result["attention_mask"][:-1],
        }

    def to_dict(text):
        return {"text": text}

    paragraphs = [to_dict(x) for x in paragraphs]
    data = Dataset.from_list(paragraphs)
    data = data.shuffle().map(lambda x: tokenize(x))

    model = prepare_model_for_int8_training(model)

    model = get_peft_model(model, LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    ))

    output_dir = f"lora-{model_name}"

    print("Training...")

    training_args = transformers.TrainingArguments(
        # Set the batch size for training on each device (GPU, CPU, or TPU).
        per_device_train_batch_size=micro_batch_size, 

        # Number of steps for gradient accumulation. This is useful when the total 
        # batch size is too large to fit in GPU memory. The effective batch size 
        # will be the product of 'per_device_train_batch_size' and 'gradient_accumulation_steps'.
        gradient_accumulation_steps=gradient_accumulation_steps,  

        # Number of warmup steps for the learning rate scheduler. During these steps, 
        # the learning rate increases linearly from 0 to its initial value. Warmup helps
        #  to reduce the risk of very large gradients at the beginning of training, 
        # which could destabilize the model.
        # warmup_steps=100, 

        # The total number of training steps. The training process will end once this 
        # number is reached, even if not all the training epochs are completed.
        # max_steps=1500, 

        # The total number of epochs (complete passes through the training data) 
        # to perform during the training process.
        num_train_epochs=epochs,  

        # The initial learning rate to be used during training.
        learning_rate=learning_rate, 

        # Enables mixed precision training using 16-bit floating point numbers (FP16). 
        # This can speed up training and reduce GPU memory consumption without 
        # sacrificing too much model accuracy.
        fp16=True,  

        # The frequency (in terms of steps) of logging training metrics and statistics 
        # like loss, learning rate, etc. In this case, it logs after every 20 steps.
        logging_steps=20, 

        # The output directory where the trained model, checkpoints, 
        # and other training artifacts will be saved.
        output_dir=output_dir, 

        # The maximum number of checkpoints to keep. When this limit is reached, 
        # the oldest checkpoint will be deleted to save a new one. In this case, 
        # a maximum of 3 checkpoints will be kept.
        save_total_limit=3,  
    )


    trainer = transformers.Trainer(
        # The pre-trained model that you want to fine-tune or train from scratch. 
        # 'model' should be an instance of a Hugging Face Transformer model, such as BERT, GPT-2, T5, etc.
        model=model, 

        # The dataset to be used for training. 'data' should be a PyTorch Dataset or 
        # a compatible format, containing the input samples and labels or masks (if required).
        train_dataset=data, 

        # The TrainingArguments instance created earlier, which contains various 
        # hyperparameters and configurations for the training process.
        args=training_args, 

        # A callable that takes a batch of samples and returns a batch of inputs for the model. 
        # This is used to prepare the input samples for training by batching, padding, and possibly masking.
        data_collator=transformers.DataCollatorForLanguageModeling( 
            tokenizer,  
            # Whether to use masked language modeling (MLM) during training. 
            # MLM is a training technique used in models like BERT, where some tokens in the 
            # input are replaced by a mask token, and the model tries to predict the 
            # original tokens. In this case, MLM is set to False, indicating that it will not be used.
            mlm=False, 
        ),
    )

    result = trainer.train(resume_from_checkpoint=False)

    model.save_pretrained(output_dir)
    
    reset_models()

    return result


with gr.Blocks(css="#refresh-button { max-width: 32px }") as demo:
    with gr.Tab("Finetuning"):

        with gr.Column():
            training_text = gr.Textbox(lines=12, label="Training Data", info="Each sequence must be separated by a double newline")

            max_seq_length = gr.Slider(
                minimum=1, maximum=4096, value=512,
                label="Max Sequence Length", 
                info="The maximum length of each sample text sequence. Sequences longer than this will be truncated."
            )

        with gr.Row():
            with gr.Column():
                micro_batch_size = gr.Slider(
                    minimum=1, maximum=100, value=1, 
                    label="Micro Batch Size", 
                    info="The number of examples in each mini-batch for gradient computation. A smaller micro_batch_size reduces memory usage but may increase training time."
                )

                gradient_accumulation_steps = gr.Slider(
                    minimum=1, maximum=10, value=1, 
                    label="Gradient Accumulation Steps", 
                    info="The number of steps to accumulate gradients before updating model parameters. This can be used to simulate a larger effective batch size without increasing memory usage."
                )

                epochs = gr.Slider(
                    minimum=1, maximum=100, value=1, 
                    label="Epochs",
                    info="The number of times to iterate over the entire training dataset. A larger number of epochs may improve model performance but also increase the risk of overfitting.")

                learning_rate = gr.Slider(
                    minimum=0.00001, maximum=0.01, value=3e-4,
                    label="Learning Rate",
                    info="The initial learning rate for the optimizer. A higher learning rate may speed up convergence but also cause instability or divergence. A lower learning rate may require more steps to reach optimal performance but also avoid overshooting or oscillating around local minima."
                )

            with gr.Column():
                lora_r = gr.Slider(
                    minimum=1, maximum=16, value=8, 
                    label="LoRA R",
                    info="The rank parameter for LoRA, which controls the dimensionality of the rank decomposition matrices. A larger lora_r increases the expressiveness and flexibility of LoRA but also increases the number of trainable parameters and memory usage."
                )

                lora_alpha = gr.Slider(
                    minimum=1, maximum=128, value=16, 
                    label="LoRA Alpha",
                    info="The scaling parameter for LoRA, which controls how much LoRA affects the original pre-trained model weights. A larger lora_alpha amplifies the impact of LoRA but may also distort or override the pre-trained knowledge."
                )
                
                lora_dropout = gr.Slider(
                    minimum=0, maximum=1, value=0.01,
                    label="LoRA Dropout",
                    info="The dropout probability for LoRA, which controls the fraction of LoRA parameters that are set to zero during training. A larger lora_dropout increases the regularization effect of LoRA but also increases the risk of underfitting."
                )

                with gr.Column():
                    model_name = gr.Textbox(
                        lines=1, label="LoRA Model Name", value=""
                    )

                    with gr.Row():
                        train_btn = gr.Button(
                            "Train", variant="primary", label="Train", 
                        )

                        abort_button = gr.Button(
                            "Abort", label="Abort", 
                        )
    
        output_text = gr.Text("Training Status")

        train_progress = train_btn.click(
            fn=tokenize_and_train,
            inputs=[
                training_text,
                max_seq_length,
                micro_batch_size,
                gradient_accumulation_steps,
                epochs,
                learning_rate,
                lora_r,
                lora_alpha,
                lora_dropout,
                model_name
            ],
            outputs=output_text
        )

        abort_button.click(None, None, None, cancels=[train_progress])

    with gr.Tab("Inference"):
        with gr.Row():
            with gr.Column():
                with gr.Row():
                        lora_model = gr.Dropdown(
                            label="LoRA Model",
                        )
                        refresh_models_list = gr.Button(
                            "Reload Models",
                            elem_id="refresh-button"
                        )
                inference_text = gr.Textbox(lines=7, label="Input Text")   
            inference_output = gr.Textbox(lines=12, label="Output Text")
        with gr.Row():
            with gr.Column():
                #  temperature, top_p, top_k, repeat_penalty, max_new_tokens
                temperature = gr.Slider(
                    minimum=0, maximum=2, value=0.7, step=0.1,
                    label="Temperature",
                    info=""
                )

                top_p = gr.Slider(
                    minimum=0, maximum=1, value=0.2, step=0.1,
                    label="Top P",
                    info=""
                )

                top_k = gr.Slider(
                    minimum=0, maximum=100, value=50, step=1,
                    label="Top K",
                    info=""
                )

                repeat_penalty = gr.Slider(
                    minimum=0, maximum=1, value=0.8, step=0.1,
                    label="Repeat Penalty",
                    info=""
                )

                max_new_tokens = gr.Slider(
                    minimum=0, maximum=4096, value=50, step=1,
                    label="Max New Tokens",
                    info=""
                )
            with gr.Column():
                with gr.Row():
                    generate_btn = gr.Button(
                        "Generate", variant="primary", label="Generate", 
                    )

                    inference_abort_button = gr.Button(
                        "Abort", label="Abort", 
                    )
            
        inference_progress = generate_btn.click(
            fn=generate_text,
            inputs=[
                lora_model,
                inference_text,
                temperature,
                top_p,
                top_k,
                repeat_penalty,
                max_new_tokens
            ],
            outputs=inference_output,
        )

        lora_model.change(
            fn=reset_models
        )

        def update_models_list():
            return gr.Dropdown.update(choices=["None"] + [
                d for d in os.listdir() if os.path.isdir(d) and d.startswith('lora-')
            ], value="None")

        refresh_models_list.click(
            update_models_list,  
            inputs=None, 
            outputs=lora_model,
        )

    

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Simple LLaMA Finetuner")
    parser.add_argument("-s", "--share", action="store_true", help="Enable sharing of the Gradio interface")
    args = parser.parse_args()

    demo.queue().launch(share=args.share)