Commit 0a5ce48 (parent: 190bc9c), committed by pseudotensor
Update with h2ogpt hash 24c76a5944a7bc0ee6249ecab5ff915592771e88

app.py CHANGED
@@ -27,6 +27,11 @@ from finetune import get_loaders, example_data_points, generate_prompt, get_gith
     human, bot, prompt_type_to_model_name, inv_prompt_type_to_model_lower
 from stopping import CallbackToGenerator, Stream, StoppingCriteriaSub
 
+is_hf = os.getenv("HUGGINGFACE_SPACES")
+is_gpth2oai = os.getenv("GPT_H2O_AI")
+is_public = is_hf or is_gpth2oai  # multi-user case with fixed model and disclaimer
+is_low_mem = is_hf  # assumes run on 24GB consumer GPU
+
 
 def main(
         load_8bit: bool = False,
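The four module-level flags added above are plain environment lookups, so which UI a user gets depends only on what the deployment exports. A standalone sketch of how they resolve (the explicit os.environ assignment is illustrative; in a real Space the variable would be set outside app.py):

import os

# Locally neither variable is set, os.getenv returns None, and both flags are falsy:
print(bool(os.getenv("HUGGINGFACE_SPACES") or os.getenv("GPT_H2O_AI")))  # False

# Simulate a hosted Space by exporting the variable (illustrative only):
os.environ["HUGGINGFACE_SPACES"] = "1"
is_hf = os.getenv("HUGGINGFACE_SPACES")
is_gpth2oai = os.getenv("GPT_H2O_AI")
is_public = is_hf or is_gpth2oai   # multi-user: fixed model, disclaimer shown
is_low_mem = is_hf                 # HF Space assumed to be a 24GB consumer GPU
print(bool(is_public), bool(is_low_mem))  # True True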
@@ -90,15 +95,22 @@ def main(
 ):
     # allow set token directly
     use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
-
-    if
-
-
-
-        temperature = 0.7
-        top_p = 1
-        top_k = 100
+
+    if is_public:
+        temperature = 0.4
+        top_p = 0.85
+        top_k = 70
         do_sample = True
+        if is_low_mem:
+            base_model = 'h2oai/h2ogpt-oasst1-512-12b'
+            load_8bit = True
+        else:
+            base_model = 'h2oai/h2ogpt-oasst1-512-20b'
+    if is_low_mem:
+        load_8bit = True
+    if is_hf:
+        # must override share if in spaces
+        share = False
 
     # get defaults
     model_lower = base_model.lower()
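Taken together, this hunk makes a public deployment override whatever sampling defaults and base model were passed in. A condensed sketch of that cascade (the helper name is hypothetical, not from app.py):

def apply_public_overrides(base_model, load_8bit, temperature, top_p, top_k, do_sample,
                           is_public, is_low_mem):
    # Hypothetical helper mirroring the branching added in this hunk.
    if is_public:
        temperature, top_p, top_k, do_sample = 0.4, 0.85, 70, True
        # smaller 12B model for the 24GB low-memory Space, 20B otherwise
        base_model = ('h2oai/h2ogpt-oasst1-512-12b' if is_low_mem
                      else 'h2oai/h2ogpt-oasst1-512-20b')
    if is_low_mem:
        load_8bit = True  # 8-bit weights to fit the smaller GPU
    return base_model, load_8bit, temperature, top_p, top_k, do_sample

# A public, low-memory deployment ends up with the 12B model in 8-bit and tamer sampling:
print(apply_public_overrides('h2oai/h2ogpt-oasst1-512-20b', False, 0.7, 1, 100, True,
                             is_public=True, is_low_mem=True))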
@@ -202,7 +214,7 @@ def main(
             assert ex[1] in [None, '']  # should be no iinput
             assert ex[2] in [None, '']  # should be no context
             prompt = ex[0]
-            cutoff_len = 768 if
+            cutoff_len = 768 if is_low_mem else 2048
             inputs = stokenizer(prompt, res,
                                 return_tensors="pt",
                                 truncation=True,
@@ -526,11 +538,11 @@ def go_gradio(**kwargs):
         """
     else:
         description = "For more information, visit [the project's website](https://github.com/h2oai/h2ogpt).<br>"
-    if
+    if is_public:
         description += """<p><b> DISCLAIMERS: </b><ul><i><li>The data used to train this model include The Pile and other sources. These may contain objectionable content, so the model may reproduce that material. Use application and responses at own risk.</i></li>"""
         if kwargs['load_8bit']:
-            description += """<i><li> Model is loaded in 8-bit and
-            description += """<i><li>Model loading and unloading disabled
+            description += """<i><li> Model is loaded in 8-bit and with other limitations in order to fit on GPUs with lower amounts of VRAM, so UX can be worse than non-hosted version.</i></li>"""
+        description += """<i><li>Model loading and unloading disabled to avoid GPU OOM for multi-user environment.</i></li></ul></p>"""
 
     if kwargs['verbose']:
         task_info_md = f"""
@@ -617,7 +629,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                 {description}
                 {task_info_md}
                 """)
-        if
+        if is_hf:
             gr.HTML('''<center><a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate this Space to skip the queue and run in a private space</center>''')
 
         # go button visible if
@@ -685,7 +697,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                                             value=kwargs['stream_output'])
                 prompt_type = gr.Dropdown(prompt_types_strings,
                                           value=kwargs['prompt_type'], label="Prompt Type",
-                                          visible=not
+                                          visible=not is_public)
                 temperature = gr.Slider(minimum=0, maximum=3,
                                         value=kwargs['temperature'],
                                         label="Temperature",
@@ -698,12 +710,12 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                                   value=kwargs['top_k'], label="Top k",
                                   info='Num. tokens to sample from'
                                   )
-                max_beams = 8 if not
+                max_beams = 8 if not is_low_mem else 2
                 num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
                                       value=min(max_beams, kwargs['num_beams']), label="Beams",
                                       info="Number of searches for optimal overall probability. "
                                            "Uses more GPU memory/compute")
-                max_max_new_tokens = 2048 if not
+                max_max_new_tokens = 2048 if not is_low_mem else kwargs['max_new_tokens']
                 max_new_tokens = gr.Slider(
                     minimum=1, maximum=max_max_new_tokens, step=1,
                     value=min(max_max_new_tokens, kwargs['max_new_tokens']), label="Max output length",
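This hunk and the two that follow repeat one pattern: choose a cap from is_low_mem, then clamp the configured default with min(cap, kwargs[...]) so the slider's initial value can never exceed what the hosted GPU affords. A minimal standalone check of that clamp (the kwargs values here are made up):

is_low_mem = True  # pretend we are on the 24GB Space
kwargs = {'num_beams': 4, 'max_new_tokens': 1024, 'max_time': 180}  # example defaults

max_beams = 8 if not is_low_mem else 2
max_max_new_tokens = 2048 if not is_low_mem else kwargs['max_new_tokens']
max_max_time = 60 * 5 if not is_low_mem else 60

# Each slider is built with maximum=<cap> and value=min(cap, configured default):
print(min(max_beams, kwargs['num_beams']))                 # 2, not 4
print(min(max_max_new_tokens, kwargs['max_new_tokens']))   # 1024
print(min(max_max_time, kwargs['max_time']))               # 60, not 180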
@@ -714,7 +726,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                 )
                 early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
                                              value=kwargs['early_stopping'])
-                max_max_time = 60 * 5 if not
+                max_max_time = 60 * 5 if not is_low_mem else 60
                 max_time = gr.Slider(minimum=0, maximum=max_max_time, step=1,
                                      value=min(max_max_time, kwargs['max_time']), label="Max. time",
                                      info="Max. time to search optimal output.")
@@ -724,17 +736,17 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                 num_return_sequences = gr.Slider(minimum=1, maximum=10, step=1,
                                                  value=kwargs['num_return_sequences'],
                                                  label="Number Returns", info="Must be <= num_beams",
-                                                 visible=not
+                                                 visible=not is_public)
                 do_sample = gr.Checkbox(label="Sample", info="Sample, for diverse output(s)",
                                         value=kwargs['do_sample'])
                 if kwargs['chat']:
                     iinput = gr.Textbox(lines=4, label="Input",
                                         placeholder=kwargs['placeholder_input'],
-                                        visible=not
+                                        visible=not is_public)
                     # nominally empty for chat mode
                     context = gr.Textbox(lines=1, label="Context",
                                          info="Ignored in chat mode.",
-                                         visible=not
+                                         visible=not is_public)
 
             with gr.TabItem("Models"):
                 with gr.Row():
@@ -744,8 +756,8 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                         model_choice = gr.Dropdown(model_options_state.value[0], label="Choose Model", value=kwargs['base_model'])
                         lora_choice = gr.Dropdown(lora_options_state.value[0], label="Choose LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
                     with gr.Column(scale=1):
-                        load_msg = "Load Model/LORA" if not
-                        else "LOAD DISABLED
+                        load_msg = "Load Model/LORA" if not is_public \
+                            else "LOAD DISABLED FOR HOSTED DEMO"
                         load_model_button = gr.Button(load_msg)
                         model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'])
                         lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
@@ -811,7 +823,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                     len(history[-1]) >= 2:
                 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 
-                max_length_tokenize = 512 if
+                max_length_tokenize = 512 if is_low_mem else 2048
                 cutoff_len = max_length_tokenize*4 # restrict deberta related to max for LLM
 
                 question = history[-1][0]
@@ -1025,7 +1037,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                                outputs=[model_state, model_used, lora_used, prompt_type])
         prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
         chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
-        if not
+        if not is_public:
             load_model_event = load_model_button.click(**load_model_args) \
                 .then(**prompt_update_args) \
                 .then(**chatbot_update_args) \
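This hunk works together with the earlier load_msg change: in public mode the button is relabeled as disabled and its .click chain is simply never registered, so pressing it does nothing. A small self-contained Gradio sketch of that gating (the flag value and the lambda callback are stand-ins, not the real load_model wiring):

import gradio as gr

is_public = True  # stand-in for the module-level flag

with gr.Blocks() as demo:
    load_msg = "Load Model/LORA" if not is_public else "LOAD DISABLED FOR HOSTED DEMO"
    load_model_button = gr.Button(load_msg)
    model_used = gr.Textbox(label="Current Model")
    if not is_public:
        # Only wire the event in the non-hosted case; in the public demo the
        # button exists but has no handler attached.
        load_model_button.click(fn=lambda: "new-model-name", outputs=model_used)

# demo.launch()  # uncomment to try it locally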
@@ -1243,7 +1255,7 @@ def evaluate(
     # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
     # RuntimeError: expected scalar type Half but found Float
     # with - 256
-    max_length_tokenize = 768 - 256 if
+    max_length_tokenize = 768 - 256 if is_low_mem else 2048 - 256
     cutoff_len = max_length_tokenize * 4  # if reaches limit, then can't generate new tokens
     output_smallest = 30 * 4
     prompt = prompt[-cutoff_len - output_smallest:]
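Note the operator precedence in the changed line: the conditional expression binds more loosely than subtraction, so the token budget is (768 - 256) = 512 in low-memory mode and (2048 - 256) = 1792 otherwise, and the character cutoff is then derived with the * 4 factor (presumably a rough characters-per-token allowance). A quick check:

for is_low_mem in (True, False):
    # parsed as (768 - 256) if is_low_mem else (2048 - 256)
    max_length_tokenize = 768 - 256 if is_low_mem else 2048 - 256
    cutoff_len = max_length_tokenize * 4
    output_smallest = 30 * 4
    print(is_low_mem, max_length_tokenize, cutoff_len, cutoff_len + output_smallest)
# True  512  2048 2168
# False 1792 7168 7288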