import gradio as gr
import spaces
import os
import gc
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
print(f"torch.__version__: {torch.__version__}")
print(f"torch cuda version: {torch.version.cuda}")
from transformers import AutoTokenizer, AutoConfig
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, MistralForCausalLM
from peft import LoraConfig, get_peft_model
title = "H2O AI Predict the LLM"
description = (
    "The objective of this [competition](https://www.kaggle.com/competitions/h2oai-predict-the-llm) was to "
    "detect which of 7 possible LLMs produced a particular response.\n\n"
    "This demo uses a fine-tuned HuggingFaceH4/zephyr-7b-beta model for the multiclass classification task.\n\n"
    "We ranked 3rd out of more than 100 participants; our team's solution is described "
    "[here](https://www.kaggle.com/competitions/h2oai-predict-the-llm/discussion/453728)."
)
title = title + "\n" + description
# Theme from https://huggingface.co/spaces/trl-lib/stack-llama/blob/main/app.py
theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    radius_size=gr.themes.sizes.radius_sm,
    font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
)
### Load the model
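# Inference-time configuration. These values are assumed to mirror the training setup of
# the fine-tuned checkpoint (sequence length, LoRA hyperparameters, and the competition's
# 7 target classes).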
class CFG:
    num_workers = os.cpu_count()
    llm_backbone = "HuggingFaceH4/zephyr-7b-beta"  # "save_pretrained_model/zephyr-7b-beta"
    tokenizer_path = "HuggingFaceH4/zephyr-7b-beta"
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_path, add_prefix_space=False, use_fast=True, trust_remote_code=True, add_eos_token=True
    )
    batch_size = 1
    max_len = 650
    seed = 42
    num_labels = 7
    lora = True
    lora_r = 4
    lora_alpha = 16
    lora_dropout = 0.05
    lora_target_modules = ""
    gradient_checkpointing = True
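# The backbone is loaded in 4-bit NF4 quantization via bitsandbytes, wrapped with LoRA
# adapters on every linear layer except the LM head, and topped with a small linear
# classification head that maps the last token's vocab-sized logits to the 7 classes.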
class CustomModel(nn.Module):
    """
    Classification model: a causal LM backbone with a linear head for the 7-class task.
    """

    def __init__(self):
        super().__init__()
        self.backbone_config = AutoConfig.from_pretrained(
            CFG.llm_backbone, trust_remote_code=True
        )

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            CFG.llm_backbone,
            config=self.backbone_config,
            quantization_config=quantization_config,
        )

        if CFG.lora:
            # Collect every linear layer (except the head) as a LoRA target module.
            target_modules = []
            for name, module in self.model.named_modules():
                if (
                    isinstance(module, (torch.nn.Linear, torch.nn.Conv1d))
                    and "head" not in name
                ):
                    name = name.split(".")[-1]
                    if name not in target_modules:
                        target_modules.append(name)
            lora_config = LoraConfig(
                r=CFG.lora_r,
                lora_alpha=CFG.lora_alpha,
                target_modules=target_modules,
                lora_dropout=CFG.lora_dropout,
                bias="none",
                task_type="CAUSAL_LM",
            )
            if CFG.gradient_checkpointing:
                self.model.enable_input_require_grads()
            self.model = get_peft_model(self.model, lora_config)
            self.model.print_trainable_parameters()

        self.classification_head = nn.Linear(
            self.backbone_config.vocab_size, CFG.num_labels, bias=False
        )
        self._init_weights(self.classification_head)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.backbone_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.backbone_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, batch):
        # Disable cache while gradient checkpointing is enabled.
        if CFG.gradient_checkpointing:
            self.model.config.use_cache = False
        self.model.config.pretraining_tp = 1
        output = self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
        # Project the last token's vocab-sized logits onto the 7 classes.
        output.logits = self.classification_head(output[0][:, -1].float())
        # Enable cache again once the forward pass is done.
        if CFG.gradient_checkpointing:
            self.model.config.use_cache = True
        return output.logits
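# Instantiate the model (quantized backbone plus randomly initialized head) once at import
# time; the fine-tuned weights are loaded later inside do_inference.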
model = CustomModel()
### End Load the model
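# End-to-end prediction: tokenize the concatenated question/response, load the fine-tuned
# fold-0 checkpoint (strict=False so that key mismatches, e.g. from the PEFT/quantization
# wrappers, are tolerated), run a fp16 forward pass, and map argmax(softmax) to one of the
# 7 LLM labels.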
def do_inference(full_text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_paths = ["model_finetuned/HuggingFaceH4-zephyr-7b-beta_fold0_best.pth"]

    def prepare_input(cfg, text):
        inputs = cfg.tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation="longest_first",
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    state = torch.load(model_paths[0], map_location=torch.device("cpu"))
    model.load_state_dict(state["model"], strict=False)
    model.eval()
    model.to(device)

    inputs = prepare_input(CFG, full_text)
    inputs["input_ids"] = inputs["input_ids"].reshape(1, -1).to(device)
    inputs["attention_mask"] = inputs["attention_mask"].reshape(1, -1).to(device)

    with torch.no_grad():
        with torch.cuda.amp.autocast(
            enabled=True, dtype=torch.float16, cache_enabled=True
        ):
            y_preds = model(inputs)
    y_preds = y_preds.detach().to("cpu").numpy().astype(np.float32)
    y_preds = torch.softmax(torch.tensor(y_preds), 1).numpy()

    labels = [
        "0. llama2-70b-chat",
        "1. wizardLM-13b",
        "2. llama2-13b-chat",
        "3. wizardLM-70b",
        "4. llama2-7b-chat",
        "5. tinyllama-1b-chat",
        "6. mistral-7b-openorca",
    ]
    result = int(np.argmax(y_preds))
    return labels[result] if 0 <= result < len(labels) else "Error"
def do_submit(question, response):
    full_text = question + " " + response
    result = do_inference(full_text)
    return result
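# No-op function decorated with @spaces.GPU, presumably so that a ZeroGPU Space allocates
# a GPU for this app even though inference itself is not wrapped by the decorator.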
@spaces.GPU
def greet():
    pass
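# Gradio UI: question/response textboxes feed do_submit, and sample rows are read from
# sample_examples.csv (expected to contain Question, Response and target columns).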
with gr.Blocks(title=title) as demo:  # theme=theme
    sample_examples = pd.read_csv('sample_examples.csv')
    example_list = sample_examples[['Question', 'Response', 'target']].sample(2).values.tolist()

    gr.Markdown(f"## {title}")
    with gr.Row():
        question_text = gr.Textbox(lines=2, placeholder="Question:", label="")
        response_text = gr.Textbox(lines=2, placeholder="Response:", label="")
        target_text = gr.Textbox(lines=1, placeholder="Target:", label="", interactive=False, visible=False)
        llm_num = gr.Textbox(value="", label="LLM #")
    with gr.Row():
        sub_btn = gr.Button("Submit")
        sub_btn.click(fn=do_submit, inputs=[question_text, response_text], outputs=[llm_num])
    gr.Markdown("## Sample Inputs:")
    gr.Examples(
        example_list,
        [question_text, response_text, target_text],
    )

demo.launch(greet)