masanorihirano committed
Commit • dc15b84
Parent(s): b3d63a6
update
app.py CHANGED
@@ -23,11 +23,14 @@ print("starting server ...")
 
 BASE_MODEL = "decapoda-research/llama-13b-hf"
 LORA_WEIGHTS = "izumi-lab/llama-13b-japanese-lora-v0-1ep"
-DATASET_REPOSITORY = os.environ.get("DATASET_REPOSITORY", None)
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
+DATASET_REPOSITORY = os.environ.get("DATASET_REPOSITORY", None)
 
 repo = None
 LOCAL_DIR = "/home/user/data/"
+PROMPT_LANG = "en"
+assert PROMPT_LANG in ["ja", "en"]
+
 if HF_TOKEN and DATASET_REPOSITORY:
     try:
         shutil.rmtree(LOCAL_DIR)
@@ -42,7 +45,6 @@ if HF_TOKEN and DATASET_REPOSITORY:
     )
     repo.git_pull()
 
-
 tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
 
 if torch.cuda.is_available():
@@ -62,7 +64,7 @@ if device == "cuda":
         load_in_8bit=True,
         device_map="auto",
     )
-    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, load_in_8bit=True)
+    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, load_in_8bit=True,)
 elif device == "mps":
     model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL,
@@ -77,10 +79,7 @@ elif device == "mps":
     )
 else:
     model = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL,
-        device_map={"": device},
-        low_cpu_mem_usage=True,
-        load_in_8bit=True,
+        BASE_MODEL, device_map={"": device},load_in_8bit=True, low_cpu_mem_usage=True
     )
     model = PeftModel.from_pretrained(
         model,
@@ -91,18 +90,29 @@ else:
 
 
 def generate_prompt(instruction: str, input: Optional[str] = None):
+    print(f"input: {input}")
     if input:
-        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+        if PROMPT_LANG == "ja":
+            return f"以下はタスクを説明する指示とさらなる文脈を適用する入力の組み合わせです。\n\n### 指示:\n{instruction}\n\n### 入力:\n{input}\n\n### Response:\n"
+        elif PROMPT_LANG == "en":
+            return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 ### Instruction:
 {instruction}
 ### Input:
 {input}
 ### Response:"""
+        else:
+            raise ValueError("PROMPT_LANG")
     else:
-        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
+        if PROMPT_LANG == "ja":
+            return f"以下はタスクを説明する指示とさらなる文脈を適用する入力の組み合わせです。\n\n### 指示:\n{instruction}\n\n### 返答:\n"
+        elif PROMPT_LANG == "en":
+            return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
 ### Instruction:
 {instruction}
 ### Response:"""
+        else:
+            raise ValueError("PROMPT_LANG")
 
 
 if device != "cpu":
@@ -114,7 +124,7 @@ if torch.__version__ >= "2":
 
 def save_inputs_and_outputs(now, inputs, outputs, generate_kwargs):
     current_hour = now.strftime("%Y-%m-%d_%H")
-    file_name = f"prompts_{current_hour}.jsonl"
+    file_name = f"prompts_{LORA_WEIGHTS.split('/')[-1]}{current_hour}.jsonl"
 
     if repo is not None:
         repo.git_pull(rebase=True)
@@ -138,11 +148,11 @@ def evaluate(
     instruction,
     input=None,
     temperature=0.7,
-    top_p=1.0,
-    top_k=40,
-    num_beams=4,
     max_new_tokens=256,
 ):
+    num_beams: int = 1
+    top_p: float = 1.0
+    top_k: int = 0
     prompt = generate_prompt(instruction, input)
     inputs = tokenizer(prompt, return_tensors="pt")
     input_ids = inputs["input_ids"].to(device)
@@ -151,6 +161,8 @@ def evaluate(
         top_p=top_p,
         top_k=top_k,
         num_beams=num_beams,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token=tokenizer.eos_token_id,
     )
     with torch.no_grad():
         generation_output = model.generate(
@@ -161,9 +173,14 @@ def evaluate(
             max_new_tokens=max_new_tokens,
         )
     s = generation_output.sequences[0]
-    output = tokenizer.decode(s)
-    output = output.split("### Response:")[1].strip()
-    if HF_TOKEN and DATASET_REPOSITORY:
+    output = tokenizer.decode(s, skip_special_tokens=True)
+    if prompt.endswith("Response:"):
+        output = output.split("### Response:")[1].strip()
+    elif prompt.endswith("返答:"):
+        output = output.split("### 返答:")[1].strip()
+    else:
+        raise ValueError(f"No valid prompt ends. {prompt}")
+    if HF_TOKEN:
         try:
             now = datetime.datetime.now()
             current_time = now.strftime("%Y-%m-%d %H:%M:%S")
@@ -215,10 +232,10 @@ with gr.Blocks(
                 clear_button = gr.Button("Clear").style(full_width=True)
             with gr.Column(scale=5):
                 submit_button = gr.Button("Submit").style(full_width=True)
-        outputs = gr.Textbox(lines=
+        outputs = gr.Textbox(lines=4, label="Output")
 
         # inputs, top_p, temperature, top_k, repetition_penalty
-        with gr.Accordion("Parameters", open=False):
+        with gr.Accordion("Parameters", open=True):
            temperature = gr.Slider(
                 minimum=0,
                 maximum=1.0,
@@ -227,34 +244,10 @@ with gr.Blocks(
                 interactive=True,
                 label="Temperature",
             )
-            top_p = gr.Slider(
-                minimum=0,
-                maximum=1.0,
-                value=1.0,
-                step=0.05,
-                interactive=True,
-                label="Top p",
-            )
-            top_k = gr.Slider(
-                minimum=1,
-                maximum=50,
-                value=4,
-                step=1,
-                interactive=True,
-                label="Top k",
-            )
-            num_beams = gr.Slider(
-                minimum=1,
-                maximum=50,
-                value=4,
-                step=1,
-                interactive=True,
-                label="Beams",
-            )
             max_new_tokens = gr.Slider(
                 minimum=1,
-                maximum=
-                value=
+                maximum=256,
+                value=128,
                 step=1,
                 interactive=True,
                 label="Max length",
@@ -301,17 +294,17 @@ with gr.Blocks(
     inputs.submit(no_interactive, [], [submit_button, clear_button])
     inputs.submit(
         evaluate,
-        [instruction, inputs, temperature, top_p, top_k, num_beams, max_new_tokens],
+        [instruction, inputs, temperature, max_new_tokens],
         [outputs, submit_button, clear_button],
     )
     submit_button.click(no_interactive, [], [submit_button, clear_button])
     submit_button.click(
         evaluate,
-        [instruction, inputs, temperature, top_p, top_k, num_beams, max_new_tokens],
+        [instruction, inputs, temperature, max_new_tokens],
         [outputs, submit_button, clear_button],
     )
     clear_button.click(reset_textbox, [], [instruction, inputs, outputs], queue=False)
 
 demo.queue(max_size=20, concurrency_count=NUM_THREADS, api_open=False).launch(
-
+    server_name="0.0.0.0", server_port=7860
 )
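For context on the model-loading hunks above: the app stacks a LoRA adapter on an 8-bit base model via PEFT. Below is a condensed sketch of the CUDA path using the repo IDs from the file; AutoModelForCausalLM as the base-model class and the final model.eval() are assumptions, since the diff only shows the call's keyword arguments.

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, LlamaTokenizer

BASE_MODEL = "decapoda-research/llama-13b-hf"
LORA_WEIGHTS = "izumi-lab/llama-13b-japanese-lora-v0-1ep"

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
# Base model quantized to 8-bit and sharded across available GPUs.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True,
    device_map="auto",
)
# LoRA adapter weights applied on top of the frozen 8-bit base.
model = PeftModel.from_pretrained(model, LORA_WEIGHTS, load_in_8bit=True)
model.eval()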
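The reworked generate_prompt switches templates on PROMPT_LANG, and evaluate later strips everything up to the template's response marker. A minimal self-contained sketch of the English path follows, with hypothetical helper names and sample strings; note that the committed Japanese templates end with a trailing \n, so as written only the English templates satisfy the endswith checks.

from typing import Optional

def build_prompt_en(instruction: str, input: Optional[str] = None) -> str:
    # English templates as committed in generate_prompt (PROMPT_LANG == "en").
    if input:
        return (
            "Below is an instruction that describes a task, paired with an input "
            "that provides further context. Write a response that appropriately "
            "completes the request.\n"
            f"### Instruction:\n{instruction}\n### Input:\n{input}\n### Response:"
        )
    return (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request.\n"
        f"### Instruction:\n{instruction}\n### Response:"
    )

def extract_response(decoded: str, prompt: str) -> str:
    # Mirrors the post-processing added in evaluate().
    if prompt.endswith("Response:"):
        return decoded.split("### Response:")[1].strip()
    if prompt.endswith("返答:"):
        return decoded.split("### 返答:")[1].strip()
    raise ValueError(f"No valid prompt ends. {prompt}")

prompt = build_prompt_en("Translate to Japanese.", "Good morning.")
print(extract_response(prompt + " おはようございます。", prompt))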
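The only change to save_inputs_and_outputs is the log-file name, which now embeds the adapter name ahead of the hour stamp. Here is a sketch of the resulting append-and-push pattern; append_log and the record payload are hypothetical, while the pull/push calls mirror the huggingface_hub Repository API the file already uses.

import datetime
import json
import os

LOCAL_DIR = "/home/user/data/"
LORA_WEIGHTS = "izumi-lab/llama-13b-japanese-lora-v0-1ep"

def append_log(now: datetime.datetime, record: dict) -> str:
    # One JSONL file per hour, now prefixed with the adapter name.
    current_hour = now.strftime("%Y-%m-%d_%H")
    file_name = f"prompts_{LORA_WEIGHTS.split('/')[-1]}{current_hour}.jsonl"
    path = os.path.join(LOCAL_DIR, file_name)
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
    return file_name

# Sketched usage around the Repository handle created earlier in app.py:
# repo.git_pull(rebase=True)
# append_log(datetime.datetime.now(), {"instruction": "...", "output": "..."})
# repo.push_to_hub(commit_message="update logs")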
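The generation hunk pins the sampling parameters in code and adds token-id kwargs. A sketch of the equivalent GenerationConfig is below; transformers documents the stopping token as eos_token_id, so that spelling is used here, whereas the committed line passes eos_token=.

from transformers import GenerationConfig, LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-13b-hf")

generation_config = GenerationConfig(
    temperature=0.7,
    top_p=1.0,   # now fixed in code instead of exposed as sliders
    top_k=0,
    num_beams=1,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,  # documented spelling; the diff passes eos_token=
)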
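With top_p, top_k, and num_beams removed from the UI, evaluate now takes exactly four inputs, and launch binds to 0.0.0.0:7860. A reduced, runnable sketch of that wiring with a stub evaluate; the temperature slider's default value and the instruction/input textbox labels are assumptions not shown in the diff.

import gradio as gr

def evaluate(instruction, inputs, temperature=0.7, max_new_tokens=256):
    # Stub standing in for the model call wired up in app.py.
    return f"[temperature={temperature}, max_new_tokens={max_new_tokens}] {instruction} / {inputs}"

with gr.Blocks() as demo:
    instruction = gr.Textbox(lines=2, label="Instruction")
    inputs = gr.Textbox(lines=2, label="Input")
    submit_button = gr.Button("Submit")
    outputs = gr.Textbox(lines=4, label="Output")
    with gr.Accordion("Parameters", open=True):
        temperature = gr.Slider(0, 1.0, value=0.7, step=0.05, label="Temperature")
        max_new_tokens = gr.Slider(1, 256, value=128, step=1, label="Max length")
    submit_button.click(
        evaluate,
        [instruction, inputs, temperature, max_new_tokens],
        [outputs],
    )

demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=7860)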