zetavg committed
Commit 883e16a
1 parent: 4623b35

resolve generation canceling issue

llama_lora/globals.py CHANGED
@@ -25,6 +25,10 @@ class Global:
     # Training Control
     should_stop_training = False
 
+    # Generation Control
+    should_stop_generating = False
+    generation_force_stopped_at = None
+
     # Model related
     model_has_been_used = False
     loaded_base_model_with_lora = None
llama_lora/ui/inference_ui.py CHANGED
@@ -19,6 +19,7 @@ from ..utils.callbacks import Iteratorize, Stream
 device = get_device()
 
 default_show_raw = True
+inference_output_lines = 12
 
 
 def do_inference(
@@ -37,6 +38,15 @@ def do_inference(
     progress=gr.Progress(track_tqdm=True),
 ):
     try:
+        if Global.generation_force_stopped_at is not None:
+            required_elapsed_time_after_forced_stop = 1
+            current_unix_time = time.time()
+            remaining_time = required_elapsed_time_after_forced_stop - \
+                (current_unix_time - Global.generation_force_stopped_at)
+            if remaining_time > 0:
+                time.sleep(remaining_time)
+            Global.generation_force_stopped_at = None
+
         variables = [variable_0, variable_1, variable_2, variable_3,
                      variable_4, variable_5, variable_6, variable_7]
         prompter = Prompter(prompt_template)
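
Note: the block added at the top of do_inference is a cooldown. After a forced stop, Global.generation_force_stopped_at holds the stop's unix timestamp, and the next call sleeps until at least one second has passed before clearing it, giving the cancelled run time to unwind. A minimal standalone sketch of the same idea (the GenerationGate name and API below are illustrative, not part of this repo):

import time

# Illustrative sketch (names are hypothetical, not from this repo): after a
# forced stop we record a timestamp, and the next run sleeps out whatever is
# left of the cooldown before clearing it.
class GenerationGate:
    def __init__(self, cooldown_seconds=1.0):
        self.cooldown_seconds = cooldown_seconds
        self.force_stopped_at = None  # unix timestamp of the last forced stop

    def mark_force_stopped(self):
        self.force_stopped_at = time.time()

    def wait_if_needed(self):
        if self.force_stopped_at is None:
            return
        remaining = self.cooldown_seconds - (time.time() - self.force_stopped_at)
        if remaining > 0:
            time.sleep(remaining)  # let the cancelled run unwind first
        self.force_stopped_at = None


gate = GenerationGate()
gate.mark_force_stopped()  # what the stop handler would do
gate.wait_if_needed()      # what the next generation call does before starting
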
@@ -69,12 +79,20 @@ def do_inference(
                         yield out
 
                 for partial_sentence in word_generator(message):
-                    yield partial_sentence, json.dumps(list(range(len(partial_sentence.split()))), indent=2)
+                    yield (
+                        gr.Textbox.update(
+                            value=partial_sentence, lines=inference_output_lines),
+                        json.dumps(
+                            list(range(len(partial_sentence.split()))), indent=2)
+                    )
                     time.sleep(0.05)
 
                 return
             time.sleep(1)
-            yield message, json.dumps(list(range(len(message.split()))), indent=2)
+            yield (
+                gr.Textbox.update(value=message, lines=1),  # TODO
+                json.dumps(list(range(len(message.split()))), indent=2)
+            )
             return
 
         model = get_base_model()
@@ -100,6 +118,19 @@ def do_inference(
             "max_new_tokens": max_new_tokens,
         }
 
+        def ui_generation_stopping_criteria(input_ids, score, **kwargs):
+            if Global.should_stop_generating:
+                return True
+            return False
+
+        Global.should_stop_generating = False
+        generate_params.setdefault(
+            "stopping_criteria", transformers.StoppingCriteriaList()
+        )
+        generate_params["stopping_criteria"].append(
+            ui_generation_stopping_criteria
+        )
+
         if stream_output:
             # Stream the reply 1 token at a time.
             # This is based on the trick of using 'stopping_criteria' to create an iterator,
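
Note: the hunk above appends a plain callable to transformers.StoppingCriteriaList. generate() consults every criterion after each new token, so returning True once Global.should_stop_generating is set ends decoding early; that is the mechanism this commit relies on. A self-contained sketch of the same pattern follows; the model name below is a placeholder for any small causal LM and is not taken from this repo:

import threading
import transformers

should_stop = threading.Event()

def stop_requested(input_ids, scores, **kwargs):
    # Called by generate() after every generated token; returning True stops decoding.
    return should_stop.is_set()

model_name = "sshleifer/tiny-gpt2"  # placeholder; any small causal LM works
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

inputs = tokenizer("Hello", return_tensors="pt")
output = model.generate(
    **inputs,
    max_new_tokens=32,
    stopping_criteria=transformers.StoppingCriteriaList([stop_requested]),
)
# Calling should_stop.set() from another thread (e.g. a UI stop handler) would
# end the generate() call above once the current token has been produced.
print(tokenizer.decode(output[0]))
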
@@ -131,29 +162,61 @@ def do_inference(
                     raw_output = None
                     if show_raw:
                         raw_output = str(output)
-                    yield prompter.get_response(decoded_output), raw_output
+                    response = prompter.get_response(decoded_output)
+
+                    if Global.should_stop_generating:
+                        return
+
+                    yield (
+                        gr.Textbox.update(
+                            value=response, lines=inference_output_lines),
+                        raw_output)
+
+                    if Global.should_stop_generating:
+                        # If the user stops the generation and then clicks the
+                        # generate button again, they may mysteriously land
+                        # here, in the previous, should-be-stopped generation
+                        # function call, with the new generation function never
+                        # being called at all. To work around this, we yield a
+                        # message and set lines=1; if the front-end JS detects
+                        # that lines has been set to 1 (rows="1" in HTML),
+                        # it will automatically click the generate button again
+                        # (gr.Textbox.update() does not support updating
+                        # elem_classes or elem_id).
+                        # [WORKAROUND-UI01]
+                        yield (
+                            gr.Textbox.update(
+                                value="Please retry", lines=1),
+                            None)
             return  # early return for stream_output
 
         # Without streaming
         with torch.no_grad():
-            generation_output = model.generate(
-                input_ids=input_ids,
-                generation_config=generation_config,
-                return_dict_in_generate=True,
-                output_scores=True,
-                max_new_tokens=max_new_tokens,
-            )
+            generation_output = model.generate(**generate_params)
         s = generation_output.sequences[0]
         output = tokenizer.decode(s)
         raw_output = None
         if show_raw:
             raw_output = str(s)
-        yield prompter.get_response(output), raw_output
+
+        response = prompter.get_response(output)
+        if Global.should_stop_generating:
+            return
+
+        yield (
+            gr.Textbox.update(value=response, lines=inference_output_lines),
+            raw_output)
+
 
     except Exception as e:
         raise gr.Error(e)
 
 
+def handle_stop_generate():
+    Global.generation_force_stopped_at = time.time()
+    Global.should_stop_generating = True
+
+
 def reload_selections(current_lora_model, current_prompt_template):
     available_template_names = get_available_template_names()
     available_template_names_with_none = available_template_names + ["None"]
@@ -186,7 +249,8 @@ def handle_prompt_template_change(prompt_template, lora_model):
             gr_updates.append(gr.Textbox.update(
                 label="Not Used", visible=False))
 
-    model_prompt_template_message_update = gr.Markdown.update("", visible=False)
+    model_prompt_template_message_update = gr.Markdown.update(
+        "", visible=False)
     lora_mode_info = get_info_of_available_lora_model(lora_model)
     if lora_mode_info and isinstance(lora_mode_info, dict):
         model_prompt_template = lora_mode_info.get("prompt_template")
@@ -352,7 +416,7 @@ def inference_ui():
             with gr.Column(elem_id="inference_output_group_container"):
                 with gr.Column(elem_id="inference_output_group"):
                     inference_output = gr.Textbox(
-                        lines=12, label="Output", elem_id="inference_output")
+                        lines=inference_output_lines, label="Output", elem_id="inference_output")
                     inference_output.style(show_copy_button=True)
                     with gr.Accordion(
                         "Raw Output",
@@ -413,8 +477,12 @@ def inference_ui():
            outputs=[inference_output, inference_raw_output],
            api_name="inference"
        )
-        stop_btn.click(fn=None, inputs=None, outputs=None,
-                       cancels=[generate_event])
+        stop_btn.click(
+            fn=handle_stop_generate,
+            inputs=None,
+            outputs=None,
+            cancels=[generate_event]
+        )
 
         update_prompt_preview_event = update_prompt_preview_btn.click(fn=update_prompt_preview, inputs=[prompt_template,
                                                               variable_0, variable_1, variable_2, variable_3,
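
Note: with this change the stop button does two things at once. It runs handle_stop_generate, which sets the flags read by the stopping criterion and the post-generation checks, and it passes cancels=[generate_event] so Gradio also stops streaming updates from the queued click event. A stripped-down sketch of that wiring, assuming Gradio 3.x with queueing enabled (component and function names below are illustrative, not from this repo):

import time
import gradio as gr

should_stop = {"value": False}

def slow_generate(prompt):
    # Stand-in for the real streaming inference: yields partial output and
    # checks a shared flag so it can bail out when asked to stop.
    should_stop["value"] = False
    text = ""
    for word in (prompt + " and some more words after that").split():
        if should_stop["value"]:
            return
        text += word + " "
        time.sleep(0.3)
        yield text

def handle_stop():
    should_stop["value"] = True

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    output = gr.Textbox(label="Output")
    generate_btn = gr.Button("Generate")
    stop_btn = gr.Button("Stop")

    generate_event = generate_btn.click(
        fn=slow_generate, inputs=prompt, outputs=output)
    # cancels= stops the queued event (no more UI updates); the flag lets the
    # generator itself stop working instead of running on in the background.
    stop_btn.click(fn=handle_stop, inputs=None, outputs=None,
                   cancels=[generate_event])

demo.queue().launch()
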
@@ -624,5 +692,27 @@ def inference_ui():
          });
        }
      }, 100);
+
+    // [WORKAROUND-UI01]
+    setTimeout(function () {
+      const inference_output_textarea = document.querySelector(
+        '#inference_output textarea'
+      );
+      if (!inference_output_textarea) return;
+      const observer = new MutationObserver(function () {
+        if (inference_output_textarea.getAttribute('rows') === '1') {
+          setTimeout(function () {
+            const inference_generate_btn = document.getElementById(
+              'inference_generate_btn'
+            );
+            if (inference_generate_btn) inference_generate_btn.click();
+          }, 10);
+        }
+      });
+      observer.observe(inference_output_textarea, {
+        attributes: true,
+        attributeFilter: ['rows'],
+      });
+    }, 100);
   }
   """)
 