masanorihirano committed
Commit 6bde6cb
Parent: dfd9622
Files changed (2)
  1. app.py +140 -137
  2. pyproject.toml +2 -2
app.py CHANGED
@@ -4,21 +4,94 @@ import os
 import shutil
 from typing import Optional
 from typing import Tuple
+from typing import Union

 import gradio as gr
 import requests
 import torch
-from fastchat.serve.inference import compress_module
-from fastchat.serve.inference import raise_warning_for_old_weights
+from fastchat.conversation import Conversation
+from fastchat.conversation import SeparatorStyle
+from fastchat.conversation import get_conv_template
+from fastchat.conversation import register_conv_template
+from fastchat.model.model_adapter import BaseAdapter
+from fastchat.model.model_adapter import load_model
+from fastchat.model.model_adapter import model_adapters
+from fastchat.serve.cli import SimpleChatIO
+from fastchat.serve.inference import generate_stream
 from huggingface_hub import Repository
-from huggingface_hub import hf_hub_download
 from huggingface_hub import snapshot_download
 from peft import LoraConfig
+from peft import PeftModel
 from peft import get_peft_model
 from peft import set_peft_model_state_dict
 from transformers import AutoModelForCausalLM
-from transformers import GenerationConfig
-from transformers import LlamaTokenizer
+from transformers import AutoTokenizer
+from transformers import PreTrainedModel
+from transformers import PreTrainedTokenizerBase
+
+
+class FastTokenizerAvailableBaseAdapter(BaseAdapter):
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+        except ValueError:
+            tokenizer = AutoTokenizer.from_pretrained(model_path)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
+        )
+        return model, tokenizer
+
+
+model_adapters[-1] = FastTokenizerAvailableBaseAdapter()
+
+
+def load_lora_model(
+    model_path: str,
+    lora_weight: str,
+    device: str,
+    num_gpus: int,
+    max_gpu_memory: Optional[str] = None,
+    load_8bit: bool = False,
+    cpu_offloading: bool = False,
+    debug: bool = False,
+) -> Tuple[Union[PreTrainedModel, PeftModel], PreTrainedTokenizerBase]:
+    model: Union[PreTrainedModel, PeftModel]
+    tokenizer: PreTrainedTokenizerBase
+    model, tokenizer = load_model(
+        model_path=model_path,
+        device=device,
+        num_gpus=num_gpus,
+        max_gpu_memory=max_gpu_memory,
+        load_8bit=load_8bit,
+        cpu_offloading=cpu_offloading,
+        debug=debug,
+    )
+    if lora_weight is not None:
+        # model = PeftModelForCausalLM.from_pretrained(model, model_path, **kwargs)
+        config = LoraConfig.from_pretrained(lora_weight)
+        model = get_peft_model(model, config)
+
+        # Check the available weights and load them
+        checkpoint_name = os.path.join(
+            lora_weight, "pytorch_model.bin"
+        )  # Full checkpoint
+        if not os.path.exists(checkpoint_name):
+            checkpoint_name = os.path.join(
+                lora_weight, "adapter_model.bin"
+            )  # only LoRA model - LoRA config above has to fit
+        # The two files above have a different name depending on how they were saved,
+        # but are actually the same.
+        if os.path.exists(checkpoint_name):
+            adapters_weights = torch.load(checkpoint_name)
+            set_peft_model_state_dict(model, adapters_weights)
+        else:
+            raise IOError(f"Checkpoint {checkpoint_name} not found")
+
+    if debug:
+        print(model)
+
+    return model, tokenizer
+

 print(datetime.datetime.now())

@@ -29,15 +102,15 @@ print(NUM_THREADS)
 print("starting server ...")

 BASE_MODEL = "decapoda-research/llama-13b-hf"
-LORA_WEIGHTS = "izumi-lab/llama-13b-japanese-lora-v0-1ep"
+LORA_WEIGHTS_HF = "izumi-lab/llama-13b-japanese-lora-v0-1ep"
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 DATASET_REPOSITORY = os.environ.get("DATASET_REPOSITORY", None)
 SLACK_WEBHOOK = os.environ.get("SLACK_WEBHOOK", None)

+LORA_WEIGHTS = snapshot_download(LORA_WEIGHTS_HF)
+
 repo = None
 LOCAL_DIR = "/home/user/data/"
-PROMPT_LANG = "en"
-assert PROMPT_LANG in ["ja", "en"]

 if HF_TOKEN and DATASET_REPOSITORY:
     try:
@@ -53,85 +126,34 @@ if HF_TOKEN and DATASET_REPOSITORY:
     )
     repo.git_pull()

-tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
-
 if torch.cuda.is_available():
     device = "cuda"
 else:
     device = "cpu"

-try:
-    if torch.backends.mps.is_available():
-        device = "mps"
-except Exception:
-    pass
-
-resume_from_checkpoint = snapshot_download(
-    repo_id=LORA_WEIGHTS, use_auth_token=HF_TOKEN
-)
-checkpoint_name = hf_hub_download(
-    repo_id=LORA_WEIGHTS, filename="adapter_model.bin", use_auth_token=HF_TOKEN
+model, tokenizer = load_lora_model(
+    model_path=BASE_MODEL,
+    lora_weight=LORA_WEIGHTS,
+    device=device,
+    num_gpus=1,
+    max_gpu_memory="16GiB",
+    load_8bit=False,
+    cpu_offloading=False,
+    debug=False,
 )
-if device == "cuda":
-    model = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL, load_in_8bit=True, device_map="auto", torch_dtype=torch.float16
-    )
-elif device == "mps":
-    model = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL,
-        device_map={"": device},
-        load_in_8bit=True,
-        torch_dtype=torch.float16,
-    )
-else:
-    model = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL,
-        device_map={"": device},
-        load_in_8bit=True,
-        low_cpu_mem_usage=True,
-        torch_dtype=torch.float16,
-    )

-config = LoraConfig.from_pretrained(resume_from_checkpoint)
-model = get_peft_model(model, config)
-adapters_weights = torch.load(checkpoint_name)
-set_peft_model_state_dict(model, adapters_weights)
-raise_warning_for_old_weights(BASE_MODEL, model)
-compress_module(model, device)
-# if device == "cuda" or device == "mps":
-#     model = model.to(device)
-
-
-def generate_prompt(instruction: str, input: Optional[str] = None):
-    if input:
-        if PROMPT_LANG == "ja":
-            return f"以下はタスクを説明する指示とさらなる文脈を適用する入力の組み合わせです。\n\n### 指示:\n{instruction}\n\n### 入力:\n{input}\n\n### Response:\n"
-        elif PROMPT_LANG == "en":
-            return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-### Instruction:
-{instruction}
-### Input:
-{input}
-### Response:"""
-        else:
-            raise ValueError("PROMPT_LANG")
-    else:
-        if PROMPT_LANG == "ja":
-            return f"以下はタスクを説明する指示とさらなる文脈を適用する入力の組み合わせです。\n\n### 指示:\n{instruction}\n\n### 返答:\n"
-        elif PROMPT_LANG == "en":
-            return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
-### Instruction:
-{instruction}
-### Response:"""
-        else:
-            raise ValueError("PROMPT_LANG")
+Conversation._get_prompt = Conversation.get_prompt
+Conversation._append_message = Conversation.append_message
+

+def conversation_append_message(cls, role: str, message: str):
+    cls.offset = -2
+    return cls._append_message(role, message)

-if device != "cpu":
-    model.half()
-model.eval()
-if torch.__version__ >= "2":
-    model = torch.compile(model)
+
+def conversation_get_prompt_overrider(cls: Conversation) -> str:
+    cls.messages = cls.messages[-2:]
+    return cls._get_prompt()


 def save_inputs_and_outputs(now, inputs, outputs, generate_kwargs):
@@ -158,20 +180,15 @@ def save_inputs_and_outputs(now, inputs, outputs, generate_kwargs):
 # https://github.com/gradio-app/gradio/issues/3514
 def evaluate(
     instruction,
-    input=None,
     temperature=0.7,
-    max_tokens=384,
+    max_tokens=256,
     repetition_penalty=1.0,
 ):
     try:
-        if temperature < 1e-8:
-            temperature = 1e-8
-        num_beams: int = 1
-        top_p: float = 0.75
-        top_k: int = 40
-        prompt = generate_prompt(instruction, input)
-        inputs = tokenizer(prompt, return_tensors="pt")
-        if len(inputs["input_ids"][0]) > max_tokens - 10:
+        conv_template = "japanese"
+
+        inputs = tokenizer(instruction, return_tensors="pt")
+        if len(inputs["input_ids"][0]) > max_tokens - 40:
             if HF_TOKEN and DATASET_REPOSITORY:
                 try:
                     now = datetime.datetime.now()
@@ -179,13 +196,10 @@ def evaluate(
                     print(f"[{current_time}] Pushing prompt and completion to the Hub")
                     save_inputs_and_outputs(
                         now,
-                        prompt,
+                        instruction,
                         "",
                         {
                             "temperature": temperature,
-                            "top_p": top_p,
-                            "top_k": top_k,
-                            "num_beams": num_beams,
                             "max_tokens": max_tokens,
                             "repetition_penalty": repetition_penalty,
                         },
@@ -193,37 +207,34 @@ def evaluate(
                 except Exception as e:
                     print(e)
             return (
-                f"please reduce the input length. Currently, {len(inputs['input_ids'][0])} ( > {max_tokens - 10}) tokens are used.",
+                f"please reduce the input length. Currently, {len(inputs['input_ids'][0])} ( > {max_tokens - 40}) tokens are used.",
                 gr.update(interactive=True),
                 gr.update(interactive=True),
             )
-        input_ids = inputs["input_ids"].to(device)
-        generation_config = GenerationConfig(
-            do_sample=False,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            num_beams=num_beams,
-            pad_token_id=tokenizer.pad_token_id,
-            eos_token=tokenizer.eos_token_id,
-        )
-        with torch.no_grad():
-            generation_output = model.generate(
-                input_ids=input_ids,
-                generation_config=generation_config,
-                return_dict_in_generate=True,
-                output_scores=True,
-                max_new_tokens=max_tokens - len(input_ids),
-            )
-        s = generation_output.sequences[0]
-        output = tokenizer.decode(s, skip_special_tokens=True)
-        if prompt.endswith("Response:"):
-            output = output.split("### Response:")[1].strip()
-        elif prompt.endswith("返答:"):
-            output = output.split("### 返答:")[1].strip()
-        else:
-            raise ValueError(f"No valid prompt ends. {prompt}")
+
+        conv = get_conv_template(conv_template)
+
+        conv.append_message(conv.roles[0], instruction)
+        conv.append_message(conv.roles[1], None)
+
+        generate_stream_func = generate_stream
+        prompt = conv.get_prompt()
+
+        gen_params = {
+            "model": BASE_MODEL,
+            "prompt": prompt,
+            "temperature": temperature,
+            "max_new_tokens": max_tokens - len(inputs["input_ids"][0]) - 30,
+            "stop": conv.stop_str,
+            "stop_token_ids": conv.stop_token_ids,
+            "echo": False,
+            "repetition_penalty": repetition_penalty,
+        }
+        chatio = SimpleChatIO()
+        chatio.prompt_for_output(conv.roles[1])
+        output_stream = generate_stream_func(model, tokenizer, gen_params, device)
+        output = chatio.stream_output(output_stream)
+
         if HF_TOKEN and DATASET_REPOSITORY:
             try:
                 now = datetime.datetime.now()
@@ -235,9 +246,6 @@ def evaluate(
                     output,
                     {
                         "temperature": temperature,
-                        "top_p": top_p,
-                        "top_k": top_k,
-                        "num_beams": num_beams,
                         "max_tokens": max_tokens,
                         "repetition_penalty": repetition_penalty,
                     },
@@ -258,6 +266,7 @@ def evaluate(
             "username": "Hugging Face Space",
             "channel": "#monitor",
         }
+
        try:
            requests.post(SLACK_WEBHOOK, data=json.dumps(payload_dic))
        except Exception:
@@ -371,25 +380,19 @@ with gr.Blocks(
            visible=True
        )

-    accept_button.click(
-        fn=enable_inputs,
-        inputs=[],
-        outputs=[user_consent_block, main_block],
-        queue=False,
-    )
-    inputs.submit(no_interactive, [], [submit_button, clear_button])
-    inputs.submit(
-        evaluate,
-        [instruction, inputs, temperature, max_tokens, repetition_penalty],
-        [outputs, submit_button, clear_button],
-    )
+    accept_button.click(
+        fn=enable_inputs,
+        inputs=[],
+        outputs=[user_consent_block, main_block],
+        queue=False,
+    )
     submit_button.click(no_interactive, [], [submit_button, clear_button])
     submit_button.click(
         evaluate,
-        [instruction, inputs, temperature, max_tokens, repetition_penalty],
+        [instruction, temperature, max_tokens, repetition_penalty],
         [outputs, submit_button, clear_button],
     )
-    clear_button.click(reset_textbox, [], [instruction, inputs, outputs], queue=False)
+    clear_button.click(reset_textbox, [], [instruction, outputs], queue=False)

 demo.queue(max_size=20, concurrency_count=NUM_THREADS, api_open=False).launch(
     server_name="0.0.0.0", server_port=7860
pyproject.toml CHANGED
@@ -15,8 +15,8 @@ huggingface-hub = "^0.14.1"
 sentencepiece = "^0.1.99"
 bitsandbytes = "^0.38.1"
 accelerate = "^0.19.0"
-fschat = "^0.2.3"
-transformers = "^4.29.2"
+fschat = "0.2.8"
+transformers = "4.28.1"


 [tool.poetry.group.dev.dependencies]
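
Reviewer note: the caret ranges become exact pins, presumably because the refactored app.py relies on fastchat entry points and call signatures that differ across fschat releases (fastchat.model.model_adapter, the one-argument SimpleChatIO.stream_output). If the versions should also be enforced at runtime, a small guard like the hypothetical check below could be added at startup; it is a sketch and not part of this commit.

    # Reviewer sketch (not in this commit): fail fast if the Space image
    # resolves versions other than the exact pins in pyproject.toml.
    from importlib.metadata import version

    EXPECTED = {"fschat": "0.2.8", "transformers": "4.28.1"}

    for package, expected in EXPECTED.items():
        installed = version(package)
        if installed != expected:
            raise RuntimeError(f"{package}=={installed} installed, expected =={expected}")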