rashmi committed on
Commit 294768e
1 Parent(s): 1924eb4
Files changed (1)
  1. app.py +101 -97
app.py CHANGED
@@ -41,118 +41,123 @@ theme = gr.themes.Monochrome(
    font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
)

- ### Load the model
- class CFG:
-     num_workers = os.cpu_count()
-     llm_backbone = "HuggingFaceH4/zephyr-7b-beta"
-     tokenizer_path = "HuggingFaceH4/zephyr-7b-beta"
-     tokenizer = AutoTokenizer.from_pretrained(
-         tokenizer_path, add_prefix_space=False, use_fast=True, trust_remote_code=True, add_eos_token=True
-     )
-     batch_size = 1
-     max_len = 650
-     seed = 42
-
-     num_labels = 7
-
-     lora = True
-     lora_r = 4
-     lora_alpha = 16
-     lora_dropout = 0.05
-     lora_target_modules = ""
-     gradient_checkpointing = True
-
-
- class CustomModel(nn.Module):
-     """
-     Model for causal language modeling problem type.
-     """
-
-     def __init__(self):
-         super().__init__()
-
-         self.backbone_config = AutoConfig.from_pretrained(
-             CFG.llm_backbone, trust_remote_code=True
-         )
-
-         quantization_config = BitsAndBytesConfig(
-             load_in_4bit=True,
-             bnb_4bit_compute_dtype=torch.float16,
-             bnb_4bit_quant_type="nf4",
-         )
-
-         self.model = AutoModelForCausalLM.from_pretrained(
-             CFG.llm_backbone,
-             config=self.backbone_config,
-             quantization_config=quantization_config,
-         )
-
-         if CFG.lora:
-             target_modules = []
-             for name, module in self.model.named_modules():
-                 if (
-                     isinstance(module, (torch.nn.Linear, torch.nn.Conv1d))
-                     and "head" not in name
-                 ):
-                     name = name.split(".")[-1]
-                     if name not in target_modules:
-                         target_modules.append(name)
-
-             lora_config = LoraConfig(
-                 r=CFG.lora_r,
-                 lora_alpha=CFG.lora_alpha,
-                 target_modules=target_modules,
-                 lora_dropout=CFG.lora_dropout,
-                 bias="none",
-                 task_type="CAUSAL_LM",
-             )
-             if CFG.gradient_checkpointing:
-                 self.model.enable_input_require_grads()
-             self.model = get_peft_model(self.model, lora_config)
-             self.model.print_trainable_parameters()
-
-         self.classification_head = nn.Linear(
-             self.backbone_config.vocab_size, CFG.num_labels, bias=False
-         )
-         self._init_weights(self.classification_head)
-
-     def _init_weights(self, module):
-         if isinstance(module, nn.Linear):
-             module.weight.data.normal_(mean=0.0, std=self.backbone_config.initializer_range)
-             if module.bias is not None:
-                 module.bias.data.zero_()
-         elif isinstance(module, nn.Embedding):
-             module.weight.data.normal_(mean=0.0, std=self.backbone_config.initializer_range)
-             if module.padding_idx is not None:
-                 module.weight.data[module.padding_idx].zero_()
-         elif isinstance(module, nn.LayerNorm):
-             module.bias.data.zero_()
-             module.weight.data.fill_(1.0)
-
-     def forward(
-         self,
-         batch
-     ):
-         # disable cache if gradient checkpointing is enabled
-         if CFG.gradient_checkpointing:
-             self.model.config.use_cache = False
-
-         self.model.config.pretraining_tp = 1
-
-         output = self.model(
-             input_ids=batch["input_ids"],
-             attention_mask=batch["attention_mask"],
-         )
-
-         output.logits = self.classification_head(output[0][:, -1].float())
-
-         # enable cache again if gradient checkpointing is enabled
-         if CFG.gradient_checkpointing:
-             self.model.config.use_cache = True
-
-         return output.logits
+ def do_inference(full_text):
+     ### Load the model
+     class CFG:
+         num_workers = os.cpu_count()
+         llm_backbone = "HuggingFaceH4/zephyr-7b-beta"
+         tokenizer_path = "HuggingFaceH4/zephyr-7b-beta"
+         tokenizer = AutoTokenizer.from_pretrained(
+             tokenizer_path, add_prefix_space=False, use_fast=True, trust_remote_code=True, add_eos_token=True
+         )
+         batch_size = 1
+         max_len = 650
+         seed = 42
+
+         num_labels = 7
+
+         lora = True
+         lora_r = 4
+         lora_alpha = 16
+         lora_dropout = 0.05
+         lora_target_modules = ""
+         gradient_checkpointing = True
+
+
+     class CustomModel(nn.Module):
+         """
+         Model for causal language modeling problem type.
+         """
+
+         def __init__(self):
+             super().__init__()
+
+             self.backbone_config = AutoConfig.from_pretrained(
+                 CFG.llm_backbone, trust_remote_code=True
+             )
+
+             quantization_config = BitsAndBytesConfig(
+                 load_in_4bit=True,
+                 bnb_4bit_compute_dtype=torch.float16,
+                 bnb_4bit_quant_type="nf4",
+             )
+
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 CFG.llm_backbone,
+                 config=self.backbone_config,
+                 quantization_config=quantization_config,
+             )
+
+             if CFG.lora:
+                 target_modules = []
+                 for name, module in self.model.named_modules():
+                     if (
+                         isinstance(module, (torch.nn.Linear, torch.nn.Conv1d))
+                         and "head" not in name
+                     ):
+                         name = name.split(".")[-1]
+                         if name not in target_modules:
+                             target_modules.append(name)
+
+                 lora_config = LoraConfig(
+                     r=CFG.lora_r,
+                     lora_alpha=CFG.lora_alpha,
+                     target_modules=target_modules,
+                     lora_dropout=CFG.lora_dropout,
+                     bias="none",
+                     task_type="CAUSAL_LM",
+                 )
+                 if CFG.gradient_checkpointing:
+                     self.model.enable_input_require_grads()
+                 self.model = get_peft_model(self.model, lora_config)
+                 self.model.print_trainable_parameters()
+
+             self.classification_head = nn.Linear(
+                 self.backbone_config.vocab_size, CFG.num_labels, bias=False
+             )
+             self._init_weights(self.classification_head)
+
+         def _init_weights(self, module):
+             if isinstance(module, nn.Linear):
+                 module.weight.data.normal_(mean=0.0, std=self.backbone_config.initializer_range)
+                 if module.bias is not None:
+                     module.bias.data.zero_()
+             elif isinstance(module, nn.Embedding):
+                 module.weight.data.normal_(mean=0.0, std=self.backbone_config.initializer_range)
+                 if module.padding_idx is not None:
+                     module.weight.data[module.padding_idx].zero_()
+             elif isinstance(module, nn.LayerNorm):
+                 module.bias.data.zero_()
+                 module.weight.data.fill_(1.0)
+
+         def forward(
+             self,
+             batch
+         ):
+             # disable cache if gradient checkpointing is enabled
+             if CFG.gradient_checkpointing:
+                 self.model.config.use_cache = False
+
+             self.model.config.pretraining_tp = 1
+
+             output = self.model(
+                 input_ids=batch["input_ids"],
+                 attention_mask=batch["attention_mask"],
+             )
+
+             output.logits = self.classification_head(output[0][:, -1].float())
+
+             # enable cache again if gradient checkpointing is enabled
+             if CFG.gradient_checkpointing:
+                 self.model.config.use_cache = True
+
+             return output.logits
+
+     model = CustomModel()
+
+     return "result"
+

### End Load the model


@@ -160,7 +165,7 @@ class CustomModel(nn.Module):

def do_submit(question, response):
    full_text = question + " " + response
-     # result = do_inference(full_text)
+     result = do_inference(full_text)
    return "result"

@spaces.GPU
@@ -168,7 +173,6 @@ def greet():
    pass

with gr.Blocks(title=title) as demo: # theme=theme
-     model = CustomModel()
    sample_examples = pd.read_csv('sample_examples.csv')
    example_list = sample_examples[['Question','Response','target']].sample(2).values.tolist()
    gr.Markdown(f"## {title}")
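
Note that in this commit do_inference still returns the placeholder string "result"; the tokenizer and CustomModel it constructs are not yet applied to full_text. The lines below are a minimal sketch, not part of the commit, of how those pieces could be wired together to produce a class prediction. The helper name run_inference, the device handling, and the argmax-over-logits decoding are assumptions for illustration only.

import torch

def run_inference(full_text, model, tokenizer, max_len=650):
    # Tokenize the concatenated question + response (same tokenizer as CFG.tokenizer assumed).
    inputs = tokenizer(
        full_text,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )
    # Move tensors to the model's device (the 4-bit backbone typically lives on GPU).
    device = next(model.parameters()).device
    batch = {
        "input_ids": inputs["input_ids"].to(device),
        "attention_mask": inputs["attention_mask"].to(device),
    }
    model.eval()
    with torch.no_grad():
        # CustomModel.forward returns logits of shape (batch_size, CFG.num_labels).
        logits = model(batch)
    # Decode to a single class index; mapping indices back to target names is an assumption.
    return int(logits.argmax(dim=-1).item())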