Commit 5b1d132
Parent(s): 6dd6b04
Update with h2oGPT hash 2cf0e36c0a86f41add0929b2c9217bfe480ffb58

Files changed:
- generate.py +67 -18
- gradio_runner.py +13 -11
- utils.py +2 -0
generate.py CHANGED

@@ -36,11 +36,11 @@ eval_extra_columns = ['prompt', 'response', 'score']
 def main(
         load_8bit: bool = False,
         load_half: bool = True,
-        infer_devices: bool = True,
+        infer_devices: bool = True,
         base_model: str = '',
         tokenizer_base_model: str = '',
         lora_weights: str = "",
-        gpu_id: int = 0,
+        gpu_id: int = 0,
 
         prompt_type: Union[int, str] = None,
         # input to generation
@@ -61,7 +61,7 @@ def main(
         share: bool = True,
         local_files_only: bool = False,
         resume_download: bool = True,
-        use_auth_token: Union[str, bool] = False,
+        use_auth_token: Union[str, bool] = False,
 
         src_lang: str = "English",
         tgt_lang: str = "Russian",
@@ -69,20 +69,18 @@ def main(
         gradio: bool = True,
         gradio_avoid_processing_markdown: bool = False,
         chat: bool = True,
-        chat_history: int = 4096,
-        chat_context: bool = False,
+        chat_history: int = 4096,
+        chat_context: bool = False,
         stream_output: bool = True,
         show_examples: bool = None,
         verbose: bool = False,
         h2ocolors: bool = True,
         height: int = 400,
         show_lora: bool = True,
-        # set to True to load --base_model after client logs in,
-        # to be able to free GPU memory when model is swapped
         login_mode_if_model0: bool = False,
         block_gradio_exit: bool = True,
         concurrency_count: int = 1,
-        api_open: bool = False,
+        api_open: bool = False,
         allow_api: bool = True,
         input_lines: int = 1,
 
@@ -98,9 +96,64 @@ def main(
         eval_sharegpt_prompts_only: int = 0,
         eval_sharegpt_prompts_only_seed: int = 1234,
         eval_sharegpt_as_output: bool = False,
-
-        hard_stop_list: typing.List[str] = [],
 ):
+    """
+
+    :param load_8bit: load model in 8-bit using bitsandbytes
+    :param load_half: load model in float16
+    :param infer_devices: whether to control devices with gpu_id. If False, then spread across GPUs
+    :param base_model: model HF-type name
+    :param tokenizer_base_model: tokenizer HF-type name
+    :param lora_weights: LORA weights path/HF link
+    :param gpu_id: if infer_devices, then use gpu_id for cuda device ID, or auto mode if gpu_id != -1
+    :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
+    :param temperature: generation temperature
+    :param top_p: generation top_p
+    :param top_k: generation top_k
+    :param num_beams: generation number of beams
+    :param repetition_penalty: generation repetition penalty
+    :param num_return_sequences: generation number of sequences (1 forced for chat)
+    :param do_sample: generation sample
+    :param max_new_tokens: generation max new tokens
+    :param min_new_tokens: generation min tokens
+    :param early_stopping: generation early stopping
+    :param max_time: maximum time to allow for generation
+    :param debug: enable debug mode
+    :param save_dir: directory chat data is saved to
+    :param share: whether to share the gradio app with sharable URL
+    :param local_files_only: whether to only use local files instead of doing to HF for models
+    :param resume_download: whether to resume downloads from HF for models
+    :param use_auth_token: whether to use HF auth token (requires CLI did huggingface-cli login before)
+    :param src_lang: source languages to include if doing translation (None = all)
+    :param tgt_lang: target languages to include if doing translation (None = all)
+    :param gradio: whether to enable gradio, or to enable benchmark mode
+    :param gradio_avoid_processing_markdown:
+    :param chat: whether to enable chat mode with chat history
+    :param chat_history: maximum character length of chat context/history
+    :param chat_context: whether to use extra helpful context if human_bot
+    :param stream_output: whether to stream output from generate
+    :param show_examples: whether to show clickable examples in gradio
+    :param verbose: whether to show verbose prints
+    :param h2ocolors: whether to use H2O.ai theme
+    :param height: height of chat window
+    :param show_lora: whether to show LORA options in UI (expert so can be hard to understand)
+    :param login_mode_if_model0: set to True to load --base_model after client logs in, to be able to free GPU memory when model is swapped
+    :param block_gradio_exit: whether to block gradio exit (used for testing)
+    :param concurrency_count: gradio concurrency count (1 is optimal for LLMs)
+    :param api_open: If False, don't let API calls skip gradio queue
+    :param allow_api: whether to allow API calls at all to gradio server
+    :param input_lines: how many input lines to show for chat box (>1 forces shift-enter for submit, else enter is submit)
+    :param sanitize_user_prompt: whether to remove profanity from user input
+    :param sanitize_bot_response: whether to remove profanity and repeat lines from bot output
+    :param extra_model_options: extra models to show in list in gradio
+    :param extra_lora_options: extra LORA to show in list in gradio
+    :param score_model: which model to score responses (None means no scoring)
+    :param auto_score: whether to automatically score responses
+    :param eval_sharegpt_prompts_only: for no gradio benchmark, if using ShareGPT prompts for eval
+    :param eval_sharegpt_prompts_only_seed: for no gradio benchmark, if seed for ShareGPT sampling
+    :param eval_sharegpt_as_output: for no gradio benchmark, whether to test ShareGPT output itself
+    :return:
+    """
     is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
     is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
     is_public = is_hf or is_gpth2oai  # multi-user case with fixed model and disclaimer
@@ -652,7 +705,6 @@ def evaluate(
             debug=False,
             concurrency_count=None,
             save_dir=None,
-            hard_stop_list=None,
             sanitize_bot_response=True,
             model_state0=None,
             is_low_mem=None,
@@ -714,10 +766,6 @@ def evaluate(
     prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
     prompt = prompter.generate_prompt(data_point)
 
-    if hard_stop_list is None:
-        # acts like undo on user entry and bot response
-        hard_stop_list = []
-
     if isinstance(tokenizer, str):
         # pipeline
         if tokenizer == "summarization":
@@ -1219,7 +1267,9 @@ def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_l
 
 
 if __name__ == "__main__":
-
+    """
+    Examples:
+
     WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 --master_port=1234 generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca_6B
     python generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights='lora-alpaca_6B'
     python generate.py --base_model='EleutherAI/gpt-neox-20b' --lora_weights='lora-alpaca_20B'
@@ -1245,6 +1295,5 @@ if __name__ == "__main__":
     python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --infer_devices=False --prompt_type='human_bot'
 
    python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6.9b
-
-    """, flush=True)
+    """
     fire.Fire(main)
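Note on the generate.py change: the commit drops the unused hard_stop_list option and the scattered inline comments, and documents every keyword argument of main() in a docstring instead. Because generate.py hands main() to python-fire in its __main__ block, each documented keyword argument is also a CLI flag. A minimal sketch of that pattern, not the h2oGPT code itself (the file name demo.py and the three parameters are illustrative only):

import fire


def main(base_model: str = '', load_8bit: bool = False, max_new_tokens: int = 256):
    """
    :param base_model: model HF-type name
    :param load_8bit: load model in 8-bit using bitsandbytes
    :param max_new_tokens: generation max new tokens
    """
    # In the real script this is where the model would be loaded and served.
    print(f"base_model={base_model} load_8bit={load_8bit} max_new_tokens={max_new_tokens}")


if __name__ == "__main__":
    # python-fire maps keyword arguments to flags, e.g.:
    #   python demo.py --base_model='EleutherAI/gpt-j-6B' --load_8bit=True
    fire.Fire(main)

This is why the docstring doubles as the command-line reference for the example invocations listed at the bottom of generate.py.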
gradio_runner.py CHANGED

@@ -48,16 +48,8 @@ def go_gradio(**kwargs):
                        Hash: {get_githash()}
                        """
     else:
-        description = "For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio)
-
-        description += "If this host is busy, try [gpt.h2o.ai 20B](https://gpt.h2o.ai) and [HF Spaces1 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot) and [HF Spaces2 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot2)<br>"
-        description += """<p><b> DISCLAIMERS: </b><ul><i><li>The model was trained on The Pile and other data, which may contain objectionable content. Use at own risk.</i></li>"""
-        if kwargs['load_8bit']:
-            description += """<i><li> Model is loaded in 8-bit and has other restrictions on this host. UX can be worse than non-hosted version.</i></li>"""
-        description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
-        if 'h2ogpt-research' in kwargs['base_model']:
-            description += """<i><li>Research demonstration only, not used for commercial purposes.</i></li>"""
-        description += """<i><li>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md).</i></li></ul></p>"""
+        description = "For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio)<br>"
+        description += """<p>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md)</p>"""
 
     if kwargs['verbose']:
         task_info_md = f"""
@@ -371,6 +363,16 @@ def go_gradio(**kwargs):
                 with gr.Row():
                     s3up_btn = gr.Button("S3UP")
                     s3up_text = gr.Textbox(label='S3UP result', interactive=False)
+            with gr.TabItem("Disclaimers"):
+                description = ""
+                description += """<p><b> DISCLAIMERS: </b><ul><i><li>The model was trained on The Pile and other data, which may contain objectionable content. Use at own risk.</i></li>"""
+                if kwargs['load_8bit']:
+                    description += """<i><li> Model is loaded in 8-bit and has other restrictions on this host. UX can be worse than non-hosted version.</i></li>"""
+                description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
+                if 'h2ogpt-research' in kwargs['base_model']:
+                    description += """<i><li>Research demonstration only, not used for commercial purposes.</i></li>"""
+                description += """<i><li>By using h2oGPT, you accept our <a href="https://github.com/h2oai/h2ogpt/blob/main/tos.md">Terms of Service</a></i></li></ul></p>"""
+                gr.Markdown(value=description, show_label=False, interactive=False)
 
         # Get flagged data
         zip_data1 = functools.partial(zip_data, root_dirs=['flagged_data_points', kwargs['save_dir']])
@@ -910,7 +912,7 @@ def go_gradio(**kwargs):
 
 
     input_args_list = ['model_state']
-    inputs_kwargs_list = ['debug', 'save_dir', '
+    inputs_kwargs_list = ['debug', 'save_dir', 'sanitize_bot_response', 'model_state0', 'is_low_mem',
                           'raise_generate_gpu_exceptions', 'chat_context', 'concurrency_count', 'lora_weights']
 
 
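Note on the gradio_runner.py change: the disclaimers leave the always-visible header description and move into their own "Disclaimers" tab, rendered as HTML through gr.Markdown. A standalone sketch of that tab pattern follows, under the assumption that load_8bit is a local flag here rather than read from kwargs as in h2oGPT, and with a trimmed disclaimer text:

import gradio as gr

load_8bit = True  # assumption for this sketch; h2oGPT reads kwargs['load_8bit']

with gr.Blocks() as demo:
    with gr.TabItem("Chat"):
        gr.Textbox(label="Instruction")
    with gr.TabItem("Disclaimers"):
        # Build the disclaimer HTML conditionally, then render it once.
        description = "<p><b>DISCLAIMERS:</b><ul>"
        description += "<li>The model may produce objectionable content. Use at own risk.</li>"
        if load_8bit:
            description += "<li>Model is loaded in 8-bit on this host.</li>"
        description += "</ul></p>"
        gr.Markdown(value=description)

if __name__ == "__main__":
    demo.launch()

Keeping the header description short and pushing the legal text into a tab is what shrinks the old 16-line block at lines 48-63 down to 8 lines while adding 10 lines near the S3UP controls.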
utils.py CHANGED

@@ -96,6 +96,8 @@ def system_info():
     for k, v in gpu_memory_frac_dict.items():
         system[f'GPU_M/%s' % k] = v
 
+    system['hash'] = get_githash()
+
     return system
 
 
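Note on the utils.py change: system_info() now records the running commit under the 'hash' key, matching the "Hash: {get_githash()}" line already shown in the gradio description. The diff does not show get_githash() itself; a hypothetical minimal version of the same idea (the real h2oGPT helper may differ) would shell out to git:

import subprocess


def get_githash():
    # Ask git for the current commit; fall back to a placeholder if git or
    # the repository is unavailable.
    try:
        return subprocess.check_output(['git', 'rev-parse', 'HEAD'], text=True).strip()
    except (subprocess.CalledProcessError, FileNotFoundError):
        return 'unknown'


def system_info():
    system = {}
    # ... CPU/GPU metrics are gathered here in the real function ...
    system['hash'] = get_githash()
    return system

Tagging the reported system info with the commit hash makes it easy to tell which build a given log or /system_info response came from.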