Spaces:

YenJung
/

Medical_GPT

Runtime error

App Files Files Community

YenJung committed on Nov 3, 2023

Commit

473ef6f

•

1 Parent(s): 71b9fae

Upload model.py

Browse files

Files changed (1) hide show

model.py +150 -0

model.py ADDED Viewed

	@@ -0,0 +1,150 @@

+import sys
+from collections import namedtuple
+import click
+import torch
+from peft import PeftModel
+from transformers import (
+    AutoModel,
+    AutoTokenizer,
+    BloomForCausalLM,
+    BloomTokenizerFast,
+    GenerationConfig,
+    LlamaForCausalLM,
+    LlamaTokenizer,
+)
+from utils import generate_prompt
+def decide_model(args, device_map):
+    ModelClass = namedtuple("ModelClass", ('tokenizer', 'model'))
+    _MODEL_CLASSES = {
+        "llama": ModelClass(**{
+            "tokenizer": LlamaTokenizer,
+            "model": LlamaForCausalLM,
+        }),
+        "chatglm": ModelClass(**{
+            "tokenizer": AutoTokenizer, #ChatGLMTokenizer,
+            "model":  AutoModel, #ChatGLMForConditionalGeneration,
+        }),
+        "bloom": ModelClass(**{
+            "tokenizer": BloomTokenizerFast,
+            "model": BloomForCausalLM,
+        }),
+        "Auto": ModelClass(**{
+            "tokenizer": AutoTokenizer,
+            "model": AutoModel,
+        })
+    }
+    model_type = "Auto" if args.model_type not in ["llama", "bloom", "chatglm"] else args.model_type
+    if model_type == "chatglm":
+        tokenizer = _MODEL_CLASSES[model_type].tokenizer.from_pretrained(
+            args.base_model,
+            trust_remote_code=True
+        )
+        # todo: ChatGLMForConditionalGeneration revision
+        model = _MODEL_CLASSES[model_type].model.from_pretrained(
+            args.base_model,
+            trust_remote_code=True,
+            device_map=device_map
+        )
+    else:
+        tokenizer = _MODEL_CLASSES[model_type].tokenizer.from_pretrained(args.base_model)
+        model = _MODEL_CLASSES[model_type].model.from_pretrained(
+            args.base_model,
+            load_in_8bit=True,
+            torch_dtype=torch.float16,
+            device_map=device_map
+        )
+    if model_type == "llama":
+        tokenizer.pad_token_id = 0
+        tokenizer.padding_side = "left"  # Allow batched inference
+    if device_map == "auto":
+        model = PeftModel.from_pretrained(
+            model,
+            args.finetuned_weights,
+            torch_dtype=torch.float16,
+        )
+    else:
+        model = PeftModel.from_pretrained(
+            model,
+            args.finetuned_weights,
+            device_map=device_map
+        )
+    return tokenizer, model
+class ModelServe:
+    def __init__(
+        self,
+        load_8bit: bool = True,
+        model_type: str = "llama",
+        base_model: str = "linhvu/decapoda-research-llama-7b-hf",
+        finetuned_weights: str = "/home/holiday01/Downloads/LLaMa/alpaca-7b-chinese/finetuned/llama-7b-hf_alpaca-en-zh",
+    ):
+        args = locals()
+        namedtupler = namedtuple("args", tuple(list(args.keys())))
+        local_args = namedtupler(**args)
+        if torch.cuda.is_available():
+            self.device = "cuda:0"
+            self.device_map = "auto"
+            #self.max_memory = {i: "12GB" for i in range(torch.cuda.device_count())}
+            #self.max_memory.update({"cpu": "30GB"})
+        else:
+            self.device = "cpu"
+            self.device_map = {"": self.device}
+        self.tokenizer, self.model = decide_model(args=local_args, device_map=self.device_map)
+        # unwind broken decapoda-research config
+        self.model.config.pad_token_id = self.tokenizer.pad_token_id = 0  # unk
+        self.model.config.bos_token_id = 1
+        self.model.config.eos_token_id = 2
+        if not load_8bit:
+            self.model.half()  # seems to fix bugs for some users.
+        self.model.eval()
+        if torch.__version__ >= "2" and sys.platform != "win32":
+            self.model = torch.compile(self.model)
+    def generate(
+        self,
+        instruction: str,
+        input: str,
+        temperature: float = 0.7,
+        top_p: float = 0.75,
+        top_k: int = 40,
+        num_beams: int = 4,
+        max_new_tokens: int = 1024,
+        **kwargs
+    ):
+        prompt = generate_prompt(instruction, input)
+        print(f"Prompt: {prompt}")
+        inputs = self.tokenizer(prompt, return_tensors="pt")
+        input_ids = inputs["input_ids"].to(self.device)
+        generation_config = GenerationConfig(
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            num_beams=num_beams,
+            **kwargs,
+        )
+        print("generating...")
+        with torch.no_grad():
+            generation_output = self.model.generate(
+                input_ids=input_ids,
+                generation_config=generation_config,
+                return_dict_in_generate=True,
+                output_scores=True,
+                max_new_tokens=max_new_tokens,
+            )
+        s = generation_output.sequences[0]
+        output = self.tokenizer.decode(s)
+        print(f"Output: {output}")
+        return output.split("### 回覆：")[1].strip()