import gradio as gr
import spaces

import os
import gc
import random
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import tokenizers
import transformers

print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
print(f"torch.__version__: {torch.__version__}")
print(f"torch cuda version: {torch.version.cuda}")
from transformers import AutoTokenizer, AutoConfig
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, MistralForCausalLM
from peft import LoraConfig, get_peft_model


title = "H2O AI Predict the LLM"

description =" The objective of this [competition](https://www.kaggle.com/competitions/h2oai-predict-the-llm) was to \
detect which out of 7 possible LLM models produced a particular response. \n\n\
This demo is utilizing finetuned HuggingFaceH4/zephyr-7b-beta model for a multiclass classification task. \n\n \
We ranked 3rd out of more than 100 participants and our team's solution is [here](https://www.kaggle.com/competitions/h2oai-predict-the-llm/discussion/453728)"

title = title + "\n" + description

# Theme from https://huggingface.co/spaces/trl-lib/stack-llama/blob/main/app.py
theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    radius_size=gr.themes.sizes.radius_sm,
    font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
)

### Load the model
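# Inference configuration: backbone and tokenizer paths, sequence length,
# batch size, and the LoRA hyper-parameters for the fine-tuned classifier.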
class CFG:
    num_workers = os.cpu_count()
    llm_backbone = "HuggingFaceH4/zephyr-7b-beta" #"save_pretrained_model/zephyr-7b-beta"
    tokenizer_path = "HuggingFaceH4/zephyr-7b-beta"
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_path, add_prefix_space=False, use_fast=True, trust_remote_code=True, add_eos_token=True
    )
    batch_size = 1
    max_len = 650
    seed = 42

    num_labels = 7

    lora = True
    lora_r = 4
    lora_alpha = 16
    lora_dropout = 0.05
    lora_target_modules = ""
    gradient_checkpointing = True


class CustomModel(nn.Module):
    """
    Model for causal language modeling problem type.
    """

    def __init__(self):
        super().__init__()

        self.backbone_config = AutoConfig.from_pretrained(
            CFG.llm_backbone, trust_remote_code=True
        )

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            CFG.llm_backbone,
            config=self.backbone_config,
            quantization_config=quantization_config,
        )

        if CFG.lora:
            target_modules = []
            for name, module in self.model.named_modules():
                if (
                    isinstance(module, (torch.nn.Linear, torch.nn.Conv1d))
                    and "head" not in name
                ):
                    name = name.split(".")[-1]
                    if name not in target_modules:
                        target_modules.append(name)

            lora_config = LoraConfig(
                r=CFG.lora_r,
                lora_alpha=CFG.lora_alpha,
                target_modules=target_modules,
                lora_dropout=CFG.lora_dropout,
                bias="none",
                task_type="CAUSAL_LM",
            )
            if CFG.gradient_checkpointing:
                self.model.enable_input_require_grads()
            self.model = get_peft_model(self.model, lora_config)
            self.model.print_trainable_parameters()

        self.classification_head = nn.Linear(
            self.backbone_config.vocab_size, CFG.num_labels, bias=False
        )
        self._init_weights(self.classification_head)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.backbone_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.backbone_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
        self,
        batch
    ):
        # disable cache if gradient checkpointing is enabled
        if CFG.gradient_checkpointing:
            self.model.config.use_cache = False

        self.model.config.pretraining_tp = 1

        output = self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
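        # Shape sketch (illustrative): output[0] holds the LM logits of shape
        # (batch, seq_len, vocab_size); the last-token slice below gives
        # (batch, vocab_size), which the classification head projects to
        # (batch, num_labels) class scores.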

        output.logits = self.classification_head(output[0][:, -1].float())

        # enable cache again if gradient checkpointing is enabled
        if CFG.gradient_checkpointing:
            self.model.config.use_cache = True

        return output.logits
    
model = CustomModel()
### End Load the model
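
# The 4-bit backbone is built once at import time; the fine-tuned checkpoint is
# loaded inside do_inference() on each request.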

def do_inference(full_text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_paths = [
        'model_finetuned/HuggingFaceH4-zephyr-7b-beta_fold0_best.pth'
    ]

    def prepare_input(cfg, text):
        inputs = cfg.tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.max_len,
            padding="max_length",
            truncation="longest_first",
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    state = torch.load(model_paths[0], map_location=torch.device("cpu"))
    model.load_state_dict(state["model"] ,strict=False)
    model.eval()
    model.to(device)

    inputs = prepare_input(CFG, full_text)
    inputs["input_ids"] = inputs["input_ids"].reshape(1, -1).to(device)
    inputs["attention_mask"] = inputs["attention_mask"].reshape(1, -1).to(device)
    
    with torch.no_grad():
        with torch.cuda.amp.autocast(
            enabled=True, dtype=torch.float16, cache_enabled=True
        ):
            y_preds = model(inputs)
    y_preds = y_preds.detach().to("cpu").numpy().astype(np.float32)
    y_preds = torch.softmax(torch.tensor(y_preds), 1).numpy()

    result = np.argmax(y_preds)

    # Map the predicted class index back to its LLM label.
    labels = [
        "0. llama2-70b-chat",
        "1. wizardLM-13b",
        "2. llama2-13b-chat",
        "3. wizardLM-70b",
        "4. llama2-7b-chat",
        "5. tinyllama-1b-chat",
        "6. mistral-7b-openorca",
    ]
    if 0 <= result < len(labels):
        return labels[result]
    else:
        return "Error"




def do_submit(question, response):
    full_text = question + " " + response
    result = do_inference(full_text)
    return result
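
# Illustrative local usage (assumes the fine-tuned checkpoint listed in
# model_paths is present on disk):
#   do_submit("What is the capital of France?", "Paris is the capital of France.")
#   # -> one of the 7 labels, e.g. "0. llama2-70b-chat"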

@spaces.GPU
def greet():
    pass
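
# On Hugging Face ZeroGPU Spaces, @spaces.GPU requests a GPU for the decorated
# function while it runs; `greet` above is an empty placeholder.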

with gr.Blocks(title=title) as demo: # theme=theme
    sample_examples = pd.read_csv('sample_examples.csv')
    example_list = sample_examples[['Question','Response','target']].sample(2).values.tolist()
    gr.Markdown(f"## {title}")
    with gr.Row():
        question_text = gr.Textbox(lines=2, placeholder="Question:", label="")
        response_text = gr.Textbox(lines=2, placeholder="Response:", label="")
        target_text = gr.Textbox(lines=1, placeholder="Target:", label="", interactive=False, visible=False)
        llm_num = gr.Textbox(value="", label="LLM #")
    with gr.Row():
        sub_btn = gr.Button("Submit")
        sub_btn.click(fn=do_submit, inputs=[question_text, response_text], outputs=[llm_num])

    gr.Markdown("## Sample Inputs:")
    gr.Examples(
        example_list,
        [question_text,response_text,target_text],
    )

demo.launch()