# Hugging Face Space: rashmi/h2oai-predict-llm (status: Sleeping)
# File size: 5,974 Bytes

import gradio as gr
import spaces

import os
import gc
import random
import warnings

# Install an "ignore" filter for all warnings. This runs before the heavier
# libraries below are imported, so it is already active during those imports.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# Raise pandas' console display limits (max rows, max columns, line width).
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import tokenizers
import transformers

# Print the library versions (and the CUDA version torch reports) at startup.
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
print(f"torch.__version__: {torch.__version__}")
print(f"torch cuda version: {torch.version.cuda}")
from transformers import AutoTokenizer, AutoConfig
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, MistralForCausalLM
from peft import LoraConfig, get_peft_model


# Page title; also rendered as the top-level heading of the UI.
title = "H2O AI Predict the LLM"

# Theme taken from https://huggingface.co/spaces/trl-lib/stack-llama/blob/main/app.py
# NOTE(review): the gr.Blocks(...) call further down has `theme=theme`
# commented out, so this object does not appear to be applied — confirm.
theme = gr.themes.Monochrome(
    font=[
        gr.themes.GoogleFont("Open Sans"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
    radius_size=gr.themes.sizes.radius_sm,
    neutral_hue="slate",
    secondary_hue="blue",
    primary_hue="indigo",
)

### Load the model
class CFG:
    """Static settings for the tokenizer and the model defined in this file.

    The tokenizer is created while the class body executes, i.e. once, at
    import time.
    """

    # Backbone checkpoint and the tokenizer that goes with it.
    llm_backbone = "HuggingFaceH4/zephyr-7b-beta"
    tokenizer_path = "HuggingFaceH4/zephyr-7b-beta"
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_path,
        use_fast=True,
        add_prefix_space=False,
        add_eos_token=True,
        trust_remote_code=True,
    )

    # General run settings.
    num_workers = os.cpu_count()
    batch_size = 1
    max_len = 650
    seed = 42

    # Output width of the classification head.
    num_labels = 7

    # LoRA adapter settings.
    lora = True
    lora_r = 4
    lora_alpha = 16
    lora_dropout = 0.05
    # Not read by the model code visible in this file, which discovers its
    # own target modules.
    lora_target_modules = ""
    gradient_checkpointing = True


class CustomModel(nn.Module):
    """
    Causal language model backbone with a linear classification head.

    The backbone named by ``CFG.llm_backbone`` is loaded with 4-bit NF4
    quantization (float16 compute). When ``CFG.lora`` is set, it is wrapped
    with LoRA adapters on every linear/conv1d layer whose qualified name does
    not contain "head". The classification head maps the backbone's
    vocabulary-sized output at the last sequence position to
    ``CFG.num_labels`` scores.
    """

    def __init__(self):
        super().__init__()

        self.backbone_config = AutoConfig.from_pretrained(
            CFG.llm_backbone, trust_remote_code=True
        )

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            CFG.llm_backbone,
            config=self.backbone_config,
            quantization_config=quantization_config,
        )

        if CFG.lora:
            lora_config = LoraConfig(
                r=CFG.lora_r,
                lora_alpha=CFG.lora_alpha,
                target_modules=self._find_lora_target_modules(self.model),
                lora_dropout=CFG.lora_dropout,
                bias="none",
                task_type="CAUSAL_LM",
            )
            if CFG.gradient_checkpointing:
                self.model.enable_input_require_grads()
            self.model = get_peft_model(self.model, lora_config)
            self.model.print_trainable_parameters()

        # The head consumes the backbone's vocabulary-sized logits.
        self.classification_head = nn.Linear(
            self.backbone_config.vocab_size, CFG.num_labels, bias=False
        )
        self._init_weights(self.classification_head)

    @staticmethod
    def _find_lora_target_modules(model):
        """Return the unique leaf names of LoRA-eligible layers, in first-seen order.

        A layer is eligible when it is an ``nn.Linear`` or ``nn.Conv1d`` and its
        fully qualified name does not contain "head".
        """
        leaf_names = (
            name.split(".")[-1]
            for name, module in model.named_modules()
            if isinstance(module, (torch.nn.Linear, torch.nn.Conv1d))
            and "head" not in name
        )
        # dict.fromkeys removes duplicates while keeping insertion order.
        return list(dict.fromkeys(leaf_names))

    def _init_weights(self, module):
        """Initialise ``module`` in place according to its type.

        Linear and Embedding weights are drawn from a normal distribution with
        std ``backbone_config.initializer_range``; Linear biases and the
        Embedding padding row are zeroed. LayerNorm is reset to weight 1 and
        bias 0. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.backbone_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.backbone_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            # A LayerNorm built without affine parameters has weight/bias set
            # to None, so each one is checked before it is touched.
            if module.bias is not None:
                module.bias.data.zero_()
            if module.weight is not None:
                module.weight.data.fill_(1.0)

    def forward(
        self,
        batch
    ):
        """Return classification scores for ``batch``.

        Args:
            batch: mapping with "input_ids" and "attention_mask" entries that
                are passed straight to the backbone.

        Returns:
            The classification head's output for the last sequence position of
            the backbone's first output (cast to float32 before the head).
        """
        # disable cache if gradient checkpointing is enabled
        if CFG.gradient_checkpointing:
            self.model.config.use_cache = False

        self.model.config.pretraining_tp = 1

        try:
            output = self.model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )
            logits = self.classification_head(output[0][:, -1].float())
        finally:
            # enable cache again if gradient checkpointing is enabled; done in
            # `finally` so a failed forward pass does not leave it disabled
            if CFG.gradient_checkpointing:
                self.model.config.use_cache = True

        return logits
    

### End Load the model




def do_submit(question, response):
    """Click handler for the Submit button.

    Joins ``question`` and ``response`` with a single space. Inference is not
    wired in yet, so the joined text is unused and the placeholder string
    "result" is returned for every input.
    """
    combined_text = " ".join([question, response])
    # TODO: run inference on combined_text and return the prediction instead.
    return "result"

@spaces.GPU
def greet():
    """Do nothing; a no-op function decorated with ``spaces.GPU``.

    NOTE(review): presumably exists only so that the Space has at least one
    GPU-decorated function at startup — confirm before removing.
    """
    return None

# Build the UI: question/response inputs, a Submit button that writes to the
# "LLM #" box, and two randomly sampled example rows from sample_examples.csv.
with gr.Blocks(title=title) as demo:  # theme=theme
    # The model is instantiated here, but the current do_submit handler does
    # not reference it.
    model = CustomModel()
    sample_examples = pd.read_csv('sample_examples.csv')
    example_list = sample_examples[['Question','Response','target']].sample(2).values.tolist()
    gr.Markdown(f"## {title}")
    with gr.Row():
        question_text = gr.Textbox(lines=2, placeholder="Question:", label="")
        response_text = gr.Textbox(lines=2, placeholder="Response:", label="")
        # Hidden, read-only box; it is the third target filled by an example row.
        target_text = gr.Textbox(lines=1, placeholder="Target:", label="", interactive=False , visible=False)
        llm_num = gr.Textbox(value="", label="LLM #")
    with gr.Row():
        sub_btn = gr.Button("Submit")
        sub_btn.click(fn=do_submit,  inputs=[question_text, response_text], outputs=[llm_num])

    gr.Markdown("## Sample Inputs:")
    gr.Examples(
        example_list,
        [question_text,response_text,target_text],
        # cache_examples=True,
    )

# launch() takes display/server options, not a callable: its first positional
# parameter is `inline`. The GPU-decorated `greet` is decorated at definition
# time and does not need to be passed here.
demo.launch()