pseudotensor committed on
Commit 5b1d132
Parent: 6dd6b04

Update with h2oGPT hash 2cf0e36c0a86f41add0929b2c9217bfe480ffb58

Files changed (3)
  1. generate.py +67 -18
  2. gradio_runner.py +13 -11
  3. utils.py +2 -0
generate.py CHANGED
@@ -36,11 +36,11 @@ eval_extra_columns = ['prompt', 'response', 'score']
 def main(
         load_8bit: bool = False,
         load_half: bool = True,
-        infer_devices: bool = True,  # really if to "control" devices now
+        infer_devices: bool = True,
         base_model: str = '',
         tokenizer_base_model: str = '',
         lora_weights: str = "",
-        gpu_id: int = 0,  # if infer_devices = True and gpu_id != -1
+        gpu_id: int = 0,
 
         prompt_type: Union[int, str] = None,
         # input to generation
@@ -61,7 +61,7 @@ def main(
         share: bool = True,
         local_files_only: bool = False,
         resume_download: bool = True,
-        use_auth_token: Union[str, bool] = False,  # True requires CLI did huggingface-cli login before running
+        use_auth_token: Union[str, bool] = False,
 
         src_lang: str = "English",
         tgt_lang: str = "Russian",
@@ -69,20 +69,18 @@ def main(
         gradio: bool = True,
         gradio_avoid_processing_markdown: bool = False,
         chat: bool = True,
-        chat_history: int = 4096,  # character length of chat context/history
-        chat_context: bool = False,  # use default context if human_bot
+        chat_history: int = 4096,
+        chat_context: bool = False,
         stream_output: bool = True,
         show_examples: bool = None,
         verbose: bool = False,
         h2ocolors: bool = True,
         height: int = 400,
         show_lora: bool = True,
-        # set to True to load --base_model after client logs in,
-        # to be able to free GPU memory when model is swapped
         login_mode_if_model0: bool = False,
         block_gradio_exit: bool = True,
         concurrency_count: int = 1,
-        api_open: bool = False,  # don't let API skip queue
+        api_open: bool = False,
         allow_api: bool = True,
         input_lines: int = 1,
 
@@ -98,9 +96,64 @@ def main(
         eval_sharegpt_prompts_only: int = 0,
         eval_sharegpt_prompts_only_seed: int = 1234,
         eval_sharegpt_as_output: bool = False,
-
-        hard_stop_list: typing.List[str] = [],
 ):
+    """
+
+    :param load_8bit: load model in 8-bit using bitsandbytes
+    :param load_half: load model in float16
+    :param infer_devices: whether to control devices with gpu_id. If False, then spread across GPUs
+    :param base_model: model HF-type name
+    :param tokenizer_base_model: tokenizer HF-type name
+    :param lora_weights: LORA weights path/HF link
+    :param gpu_id: if infer_devices, then use gpu_id for cuda device ID, or auto mode if gpu_id != -1
+    :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
+    :param temperature: generation temperature
+    :param top_p: generation top_p
+    :param top_k: generation top_k
+    :param num_beams: generation number of beams
+    :param repetition_penalty: generation repetition penalty
+    :param num_return_sequences: generation number of sequences (1 forced for chat)
+    :param do_sample: generation sample
+    :param max_new_tokens: generation max new tokens
+    :param min_new_tokens: generation min tokens
+    :param early_stopping: generation early stopping
+    :param max_time: maximum time to allow for generation
+    :param debug: enable debug mode
+    :param save_dir: directory chat data is saved to
+    :param share: whether to share the gradio app with sharable URL
+    :param local_files_only: whether to only use local files instead of going to HF for models
+    :param resume_download: whether to resume downloads from HF for models
+    :param use_auth_token: whether to use HF auth token (requires CLI did huggingface-cli login before)
+    :param src_lang: source languages to include if doing translation (None = all)
+    :param tgt_lang: target languages to include if doing translation (None = all)
+    :param gradio: whether to enable gradio, or to enable benchmark mode
+    :param gradio_avoid_processing_markdown:
+    :param chat: whether to enable chat mode with chat history
+    :param chat_history: maximum character length of chat context/history
+    :param chat_context: whether to use extra helpful context if human_bot
+    :param stream_output: whether to stream output from generate
+    :param show_examples: whether to show clickable examples in gradio
+    :param verbose: whether to show verbose prints
+    :param h2ocolors: whether to use H2O.ai theme
+    :param height: height of chat window
+    :param show_lora: whether to show LORA options in UI (expert so can be hard to understand)
+    :param login_mode_if_model0: set to True to load --base_model after client logs in, to be able to free GPU memory when model is swapped
+    :param block_gradio_exit: whether to block gradio exit (used for testing)
+    :param concurrency_count: gradio concurrency count (1 is optimal for LLMs)
+    :param api_open: If False, don't let API calls skip gradio queue
+    :param allow_api: whether to allow API calls at all to gradio server
+    :param input_lines: how many input lines to show for chat box (>1 forces shift-enter for submit, else enter is submit)
+    :param sanitize_user_prompt: whether to remove profanity from user input
+    :param sanitize_bot_response: whether to remove profanity and repeat lines from bot output
+    :param extra_model_options: extra models to show in list in gradio
+    :param extra_lora_options: extra LORA to show in list in gradio
+    :param score_model: which model to score responses (None means no scoring)
+    :param auto_score: whether to automatically score responses
+    :param eval_sharegpt_prompts_only: for no gradio benchmark, if using ShareGPT prompts for eval
+    :param eval_sharegpt_prompts_only_seed: for no gradio benchmark, seed for ShareGPT sampling
+    :param eval_sharegpt_as_output: for no gradio benchmark, whether to test ShareGPT output itself
+    :return:
+    """
     is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
     is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
     is_public = is_hf or is_gpth2oai  # multi-user case with fixed model and disclaimer
@@ -652,7 +705,6 @@ def evaluate(
         debug=False,
         concurrency_count=None,
         save_dir=None,
-        hard_stop_list=None,
         sanitize_bot_response=True,
         model_state0=None,
         is_low_mem=None,
@@ -714,10 +766,6 @@ def evaluate(
     prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
     prompt = prompter.generate_prompt(data_point)
 
-    if hard_stop_list is None:
-        # acts like undo on user entry and bot response
-        hard_stop_list = []
-
     if isinstance(tokenizer, str):
         # pipeline
         if tokenizer == "summarization":
@@ -1219,7 +1267,9 @@ def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_l
 
 
 if __name__ == "__main__":
-    print("""
+    """
+    Examples:
+
     WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 --master_port=1234 generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca_6B
     python generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights='lora-alpaca_6B'
     python generate.py --base_model='EleutherAI/gpt-neox-20b' --lora_weights='lora-alpaca_20B'
@@ -1245,6 +1295,5 @@ if __name__ == "__main__":
     python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --infer_devices=False --prompt_type='human_bot'
 
     python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6.9b
-
-    """, flush=True)
+    """
     fire.Fire(main)
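
Note on the entry point shown above: generate.py hands main() to python-fire, so every keyword argument documented in the new docstring is also a --flag on the command line (e.g. --base_model=..., --api_open=False). Below is a minimal, self-contained sketch of that pattern; the serve() function and its parameters are illustrative only, not part of h2oGPT.

# demo_fire.py -- illustrative sketch of the fire.Fire(main) pattern used by generate.py.
# The serve() function and its parameters are hypothetical, not h2oGPT code.
import fire


def serve(base_model: str = '', load_8bit: bool = False, max_new_tokens: int = 256):
    """
    :param base_model: HF model name to load (hypothetical example)
    :param load_8bit: whether to load in 8-bit (hypothetical example)
    :param max_new_tokens: generation cap (hypothetical example)
    """
    print(f"model={base_model!r} load_8bit={load_8bit} max_new_tokens={max_new_tokens}")


if __name__ == "__main__":
    # e.g. python demo_fire.py --base_model='EleutherAI/gpt-j-6B' --load_8bit=True
    fire.Fire(serve)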
gradio_runner.py CHANGED
@@ -48,16 +48,8 @@ def go_gradio(**kwargs):
         Hash: {get_githash()}
         """
     else:
-        description = "For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio).<br>"
-        if is_public:
-            description += "If this host is busy, try [gpt.h2o.ai 20B](https://gpt.h2o.ai) and [HF Spaces1 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot) and [HF Spaces2 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot2)<br>"
-        description += """<p><b> DISCLAIMERS: </b><ul><i><li>The model was trained on The Pile and other data, which may contain objectionable content. Use at own risk.</i></li>"""
-        if kwargs['load_8bit']:
-            description += """<i><li> Model is loaded in 8-bit and has other restrictions on this host. UX can be worse than non-hosted version.</i></li>"""
-        description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
-        if 'h2ogpt-research' in kwargs['base_model']:
-            description += """<i><li>Research demonstration only, not used for commercial purposes.</i></li>"""
-        description += """<i><li>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md).</i></li></ul></p>"""
+        description = "For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio)<br>"
+        description += """<p>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md)</p>"""
 
     if kwargs['verbose']:
         task_info_md = f"""
@@ -371,6 +363,16 @@ def go_gradio(**kwargs):
                 with gr.Row():
                     s3up_btn = gr.Button("S3UP")
                     s3up_text = gr.Textbox(label='S3UP result', interactive=False)
+            with gr.TabItem("Disclaimers"):
+                description = ""
+                description += """<p><b> DISCLAIMERS: </b><ul><i><li>The model was trained on The Pile and other data, which may contain objectionable content. Use at own risk.</i></li>"""
+                if kwargs['load_8bit']:
+                    description += """<i><li> Model is loaded in 8-bit and has other restrictions on this host. UX can be worse than non-hosted version.</i></li>"""
+                description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
+                if 'h2ogpt-research' in kwargs['base_model']:
+                    description += """<i><li>Research demonstration only, not used for commercial purposes.</i></li>"""
+                description += """<i><li>By using h2oGPT, you accept our <a href="https://github.com/h2oai/h2ogpt/blob/main/tos.md">Terms of Service</a></i></li></ul></p>"""
+                gr.Markdown(value=description, show_label=False, interactive=False)
 
             # Get flagged data
             zip_data1 = functools.partial(zip_data, root_dirs=['flagged_data_points', kwargs['save_dir']])
@@ -910,7 +912,7 @@ def go_gradio(**kwargs):
 
 
     input_args_list = ['model_state']
-    inputs_kwargs_list = ['debug', 'save_dir', 'hard_stop_list', 'sanitize_bot_response', 'model_state0', 'is_low_mem',
+    inputs_kwargs_list = ['debug', 'save_dir', 'sanitize_bot_response', 'model_state0', 'is_low_mem',
                           'raise_generate_gpu_exceptions', 'chat_context', 'concurrency_count', 'lora_weights']
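
The inputs_kwargs_list change above mirrors the removal of hard_stop_list from evaluate() in generate.py: the list names keyword arguments that are pre-bound into the evaluate call before it is handed to gradio. The exact wiring is not part of this diff; the following is only a minimal sketch, under the assumption that the whitelisted keys are pulled from kwargs and bound with functools.partial.

# Illustrative sketch only -- the real binding code in gradio_runner.py is not shown in this commit.
import functools


def evaluate(*args, debug=False, save_dir=None, sanitize_bot_response=True, **other):
    # stand-in for evaluate() in generate.py
    return dict(debug=debug, save_dir=save_dir, sanitize_bot_response=sanitize_bot_response)


inputs_kwargs_list = ['debug', 'save_dir', 'sanitize_bot_response']
kwargs = dict(debug=True, save_dir='chats', sanitize_bot_response=True, height=400)

# keep only the whitelisted keys, then pre-bind them so the gradio callback
# only needs to supply the per-request inputs
fun = functools.partial(evaluate, **{k: kwargs[k] for k in inputs_kwargs_list})
print(fun())  # {'debug': True, 'save_dir': 'chats', 'sanitize_bot_response': True}
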
utils.py CHANGED
@@ -96,6 +96,8 @@ def system_info():
     for k, v in gpu_memory_frac_dict.items():
         system[f'GPU_M/%s' % k] = v
 
+    system['hash'] = get_githash()
+
     return system
 
 
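
The new system['hash'] entry relies on get_githash(), which already exists in utils.py and is not touched by this commit. For readers who want to reproduce the value locally, a common approach is a git rev-parse call; the sketch below is an assumption about one possible implementation, not the actual h2oGPT code.

# Illustrative sketch only -- the actual get_githash() in utils.py is not part of this diff.
import subprocess


def get_githash():
    # Return the current commit hash of the working tree, or 'unknown' outside a git checkout.
    try:
        return subprocess.check_output(
            ['git', 'rev-parse', 'HEAD'], stderr=subprocess.DEVNULL
        ).decode().strip()
    except (subprocess.CalledProcessError, FileNotFoundError):
        return 'unknown'


if __name__ == "__main__":
    print(get_githash())  # e.g. 2cf0e36c0a86f41add0929b2c9217bfe480ffb58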