diff --git "a/gen.py" "b/gen.py" deleted file mode 100644--- "a/gen.py" +++ /dev/null @@ -1,3821 +0,0 @@ -import ast -import copy -import functools -import inspect -import queue -import sys -import os -import time -import traceback -import typing -import warnings -from datetime import datetime -import requests -from requests import ConnectTimeout, JSONDecodeError -from urllib3.exceptions import ConnectTimeoutError, MaxRetryError, ConnectionError -from requests.exceptions import ConnectionError as ConnectionError2 -from requests.exceptions import ReadTimeout as ReadTimeout2 - -if os.path.dirname(os.path.abspath(__file__)) not in sys.path: - sys.path.append(os.path.dirname(os.path.abspath(__file__))) - -os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1' -os.environ['BITSANDBYTES_NOWELCOME'] = '1' -warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated') - -# more is not useful typically, don't let these go beyond limits and eat up resources -max_cores = max(1, os.cpu_count() // 2) -if os.getenv('NUMEXPR_MAX_THREADS') is None: - os.environ['NUMEXPR_MAX_THREADS'] = str(min(8, max_cores)) -if os.getenv('NUMEXPR_NUM_THREADS') is None: - os.environ['NUMEXPR_NUM_THREADS'] = str(min(8, max_cores)) -if os.getenv('OMP_NUM_THREADS') is None: - os.environ['OMP_NUM_THREADS'] = str(min(8, max_cores)) -if os.getenv('OPENBLAS_NUM_THREADS') is None: - os.environ['OPENBLAS_NUM_THREADS'] = str(min(8, max_cores)) -if os.getenv('DUCKDB_NUM_THREADS') is None: - os.environ['DUCKDB_NUM_THREADS'] = str(min(4, max_cores)) -if os.getenv('RAYON_RS_NUM_CPUS') is None: - os.environ['RAYON_RS_NUM_CPUS'] = str(min(8, max_cores)) -if os.getenv('RAYON_NUM_THREADS') is None: - os.environ['RAYON_NUM_THREADS'] = str(min(8, max_cores)) - -import numpy as np -from evaluate_params import eval_func_param_names, no_default_param_names, input_args_list -from enums import DocumentSubset, LangChainMode, no_lora_str, model_token_mapping, no_model_str, \ - LangChainAction, LangChainAgent, DocumentChoice, LangChainTypes, super_source_prefix, \ - super_source_postfix, t5_type, get_langchain_prompts, gr_to_lg, invalid_key_msg -from loaders import get_loaders -from utils import set_seed, clear_torch_cache, NullContext, wrapped_partial, EThread, get_githash, \ - import_matplotlib, get_device, makedirs, get_kwargs, start_faulthandler, get_hf_server, FakeTokenizer, \ - have_langchain, set_openai, cuda_vis_check, H2O_Fire, lg_to_gr, str_to_list, str_to_dict, get_token_count - -start_faulthandler() -import_matplotlib() - -SEED = 1236 -set_seed(SEED) - -from typing import Union - -import torch -from transformers import GenerationConfig, AutoModel, TextIteratorStreamer - -from prompter import Prompter, inv_prompt_type_to_model_lower, non_hf_types, PromptType, get_prompt, generate_prompt -from stopping import get_stopping - -langchain_actions = [x.value for x in list(LangChainAction)] - -langchain_agents_list = [x.value for x in list(LangChainAgent)] - - -def main( - load_8bit: bool = False, - load_4bit: bool = False, - low_bit_mode: int = 1, - load_half: bool = None, - load_gptq: str = '', - load_exllama: bool = False, - use_safetensors: bool = False, - revision: str = None, - use_gpu_id: bool = True, - base_model: str = '', - tokenizer_base_model: str = '', - lora_weights: str = "", - gpu_id: int = 0, - compile_model: bool = None, - use_cache: bool = None, - inference_server: str = "", - prompt_type: Union[int, str] = None, - prompt_dict: typing.Dict = None, - system_prompt: str = '', - - # llama and gpt4all 
settings - llamacpp_dict: typing.Dict = dict(n_gpu_layers=100, use_mlock=True, n_batch=1024, n_gqa=0), - model_path_llama: str = 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q8_0.bin', - # 'llama-2-7b-chat.ggmlv3.q8_0.bin', - model_name_gptj: str = 'ggml-gpt4all-j-v1.3-groovy.bin', - model_name_gpt4all_llama: str = 'ggml-wizardLM-7B.q4_2.bin', - model_name_exllama_if_no_config: str = 'TheBloke/Nous-Hermes-Llama2-GPTQ', - - model_lock: typing.List[typing.Dict[str, str]] = None, - model_lock_columns: int = None, - fail_if_cannot_connect: bool = False, - - # input to generation - temperature: float = None, - top_p: float = None, - top_k: int = None, - num_beams: int = None, - repetition_penalty: float = None, - num_return_sequences: int = None, - do_sample: bool = None, - max_new_tokens: int = None, - min_new_tokens: int = None, - early_stopping: Union[bool, str] = None, - max_time: float = None, - - memory_restriction_level: int = None, - debug: bool = False, - save_dir: str = None, - share: bool = False, - local_files_only: bool = False, - resume_download: bool = True, - use_auth_token: Union[str, bool] = False, - trust_remote_code: Union[str, bool] = True, - rope_scaling: dict = None, - max_seq_len: int = None, - offload_folder: str = "offline_folder", - - src_lang: str = "English", - tgt_lang: str = "Russian", - - prepare_offline_level: int = 0, - cli: bool = False, - cli_loop: bool = True, - gradio: bool = True, - gradio_offline_level: int = 0, - server_name: str = "0.0.0.0", - root_path: str = "", - chat: bool = True, - chat_conversation: typing.List[typing.Tuple[str, str]] = None, - text_context_list: typing.List[str] = None, - stream_output: bool = True, - async_output: bool = True, - num_async: int = 3, - show_examples: bool = None, - verbose: bool = False, - h2ocolors: bool = True, - dark: bool = False, # light tends to be best - height: int = 600, - show_lora: bool = True, - show_llama: bool = True, - show_gpt4all: bool = False, - login_mode_if_model0: bool = False, - block_gradio_exit: bool = True, - concurrency_count: int = 1, - api_open: bool = False, - allow_api: bool = True, - input_lines: int = 1, - gradio_size: str = None, - show_copy_button: bool = True, - large_file_count_mode: bool = False, - pre_load_embedding_model: bool = True, - - auth: Union[typing.List[typing.Tuple[str, str]], str] = None, - auth_filename: str = None, - auth_access: str = 'open', - auth_freeze: bool = False, - auth_message: str = None, - guest_name: str = "guest", - enforce_h2ogpt_api_key: bool = None, - h2ogpt_api_keys: Union[list, str] = [], - h2ogpt_key: str = None, - - max_max_time=None, - max_max_new_tokens=None, - - visible_models: list = None, - visible_visible_models: bool = True, - visible_submit_buttons: bool = True, - visible_side_bar: bool = True, - visible_doc_track: bool = True, - visible_chat_tab: bool = True, - visible_doc_selection_tab: bool = True, - visible_doc_view_tab: bool = True, - visible_chat_history_tab: bool = True, - visible_expert_tab: bool = True, - visible_models_tab: bool = True, - visible_system_tab: bool = True, - visible_tos_tab: bool = False, - visible_login_tab: bool = True, - visible_hosts_tab: bool = False, - chat_tables: bool = False, - visible_h2ogpt_header: bool = True, - max_raw_chunks: int = None, - - sanitize_user_prompt: bool = False, - sanitize_bot_response: bool = False, - - extra_model_options: typing.List[str] = [], - extra_lora_options: typing.List[str] = [], - extra_server_options: typing.List[str] = [], 
- - score_model: str = 'auto', - - eval_filename: str = None, - eval_prompts_only_num: int = 0, - eval_prompts_only_seed: int = 1234, - eval_as_output: bool = False, - - langchain_mode: str = None, - user_path: str = None, - langchain_modes: list = [LangChainMode.USER_DATA.value, LangChainMode.MY_DATA.value, LangChainMode.LLM.value, - LangChainMode.DISABLED.value], - langchain_mode_paths: dict = {LangChainMode.USER_DATA.value: None}, - langchain_mode_types: dict = {LangChainMode.USER_DATA.value: LangChainTypes.SHARED.value}, - detect_user_path_changes_every_query: bool = False, - - langchain_action: str = LangChainAction.QUERY.value, - langchain_agents: list = [], - force_langchain_evaluate: bool = False, - - visible_langchain_actions: list = [LangChainAction.QUERY.value, LangChainAction.SUMMARIZE_MAP.value], - visible_langchain_agents: list = langchain_agents_list.copy(), - - document_subset: str = DocumentSubset.Relevant.name, - document_choice: list = [DocumentChoice.ALL.value], - - use_llm_if_no_docs: bool = True, - load_db_if_exists: bool = True, - keep_sources_in_context: bool = False, - db_type: str = 'chroma', - use_openai_embedding: bool = False, - use_openai_model: bool = False, - hf_embedding_model: str = None, - migrate_embedding_model: str = False, - auto_migrate_db: bool = False, - cut_distance: float = 1.64, - answer_with_sources: bool = True, - append_sources_to_answer: bool = True, - show_accordions: bool = True, - top_k_docs_max_show: int = 10, - show_link_in_sources: bool = True, - pre_prompt_query: str = None, - prompt_query: str = None, - pre_prompt_summary: str = None, - prompt_summary: str = None, - add_chat_history_to_context: bool = True, - add_search_to_context: bool = False, - context: str = '', - iinput: str = '', - allow_upload_to_user_data: bool = True, - reload_langchain_state: bool = True, - allow_upload_to_my_data: bool = True, - enable_url_upload: bool = True, - enable_text_upload: bool = True, - enable_sources_list: bool = True, - chunk: bool = True, - chunk_size: int = 512, - top_k_docs: int = None, - docs_ordering_type: str = 'reverse_ucurve_sort', - min_max_new_tokens=256, - auto_reduce_chunks: bool = True, - max_chunks: int = 100, - headsize: int = 50, - n_jobs: int = -1, - - # urls - use_unstructured=True, - use_playwright=False, - use_selenium=False, - - # pdfs - use_pymupdf='auto', - use_unstructured_pdf='auto', - use_pypdf='auto', - enable_pdf_ocr='auto', - enable_pdf_doctr='auto', - try_pdf_as_html='auto', - - # images - enable_ocr=False, - enable_doctr=False, - enable_pix2struct=False, - enable_captions=True, - - pre_load_caption_model: bool = False, - caption_gpu: bool = True, - captions_model: str = "Salesforce/blip-image-captioning-base", - doctr_gpu: bool = True, - - # json - jq_schema='.[]', - - max_quality: bool = False, - - enable_heap_analytics: bool = True, - heap_app_id: str = "1680123994", -): - """ - - :param load_8bit: load model in 8-bit using bitsandbytes - :param load_4bit: load model in 4-bit using bitsandbytes - :param low_bit_mode: 0: no quantization config 1: change compute 2: nf4 3: double quant 4: 2 and 3 - See: https://huggingface.co/docs/transformers/main_classes/quantization - If using older bitsandbytes or transformers, 0 is required - :param load_half: load model in float16 (None means auto, which means True unless t5 based model) - otherwise specify bool - :param load_gptq: to load model with GPTQ, put model_basename here, e.g. 
gptq_model-4bit--1g
-    :param load_exllama: whether to use exllama (only applicable to LLaMa1/2 models with 16-bit or GPTQ)
-    :param use_safetensors: to use safetensors version (assumes file/HF points to safe tensors version)
-    :param revision: Which HF revision to use
-    :param use_gpu_id: whether to control devices with gpu_id. If False, then spread across GPUs
-    :param base_model: model HF-type name. If use --base_model to preload model, cannot unload in gradio in models tab
-    :param tokenizer_base_model: tokenizer HF-type name. Usually not required, inferred from base_model.
-    :param lora_weights: LORA weights path/HF link
-    :param gpu_id: if use_gpu_id, then use gpu_id for cuda device ID, or auto mode if gpu_id != -1
-    :param compile_model: Whether to compile the model
-    :param use_cache: Whether to use caching in model (some models fail when multiple threads use)
-    :param inference_server: Consume base_model as type of model at this address
-        Address can be text-generation-server hosting that base_model
-        e.g. python generate.py --inference_server="http://192.168.1.46:6112" --base_model=h2oai/h2ogpt-oasst1-512-12b
-
-        Or Address can be "openai_chat" or "openai" for OpenAI API
-        Or Address can be "openai_azure_chat" or "openai_azure" for Azure OpenAI API
-        e.g. python generate.py --inference_server="openai_chat" --base_model=gpt-3.5-turbo
-        e.g. python generate.py --inference_server="openai" --base_model=text-davinci-003
-        e.g. python generate.py --inference_server="openai_azure_chat:<deployment_name>:<base_url>:<api_version>:<model_version>" --base_model=gpt-3.5-turbo
-        e.g. python generate.py --inference_server="openai_azure:<deployment_name>:<base_url>:<api_version>:<model_version>" --base_model=text-davinci-003
-        Optionals (Replace with None or just leave empty but keep :)
-            <deployment_name> of some deployment name
-            <base_url>: e.g. "<endpoint>.openai.azure.com" for some without https://
-            <api_version> of some api, e.g. 2023-05-15
-            <model_version> e.g. 0613
-
-        Or Address can be for vLLM:
-            Use: "vllm:IP:port" for OpenAI-compliant vLLM endpoint
-            Note: vllm_chat not supported by vLLM project.
-
-        Or Address can be replicate:
-            Use:
-            --inference_server=replicate:<model name string> will use a Replicate server, requiring a Replicate key.
-            e.g. looks like "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5"
-
-        Or Address can be for AWS SageMaker:
-            Use: "sagemaker_chat:<endpoint name>" for chat models that AWS sets up as dialog
-            Use: "sagemaker:<endpoint name>" for foundation models where AWS only accepts text as input
-
-    :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
-    :param prompt_dict: If prompt_type=custom, then expects (some) items returned by get_prompt(..., return_dict=True)
-    :param system_prompt: Universal system prompt to use if model supports, like LLaMa2, regardless of prompt_type definition.
-        Useful for langchain case to control behavior, or OpenAI and Replicate.
- If None, 'None', or 'auto', then for LLaMa or other models that internally have system_prompt, will use default for each model - If '', then no system prompt (no empty template given to model either, just no system part added at all) - If some string not in ['None', 'auto'], then use that as system prompt - Default is '', no system_prompt, because often it hurts performance/accuracy - - :param llamacpp_dict: - n_gpu_layers: for llama.cpp based models, number of GPU layers to offload (default is all by using large value) - use_mlock: when using `llama.cpp` based CPU models, for computers with low system RAM or slow CPUs, recommended False - n_batch: Can make smaller to 128 for slower low-memory CPU systems - n_gqa: Required to be 8 for LLaMa 70B - ... etc. anything that could be passed to llama.cpp or GPT4All models - e.g. python generate.py --base_model='llama' --prompt_type=llama2 --score_model=None --langchain_mode='UserData' --user_path=user_path --llamacpp_dict="{'n_gpu_layers':25,'n_batch':128}" - :param model_path_llama: model path or URL (for auto-download) - :param model_name_gptj: model path or URL (for auto-download) - :param model_name_gpt4all_llama: model path or URL (for auto-download) - :param model_name_exllama_if_no_config: exllama model's full path for model, tokenizer, generator for use when no HuggingFace config - - :param model_lock: Lock models to specific combinations, for ease of use and extending to many models - Only used if gradio = True - List of dicts, each dict has base_model, tokenizer_base_model, lora_weights, inference_server, prompt_type, and prompt_dict - If all models have same prompt_type, and prompt_dict, can still specify that once in CLI outside model_lock as default for dict - Can specify model_lock instead of those items on CLI - As with CLI itself, base_model can infer prompt_type and prompt_dict if in prompter.py. - Also, tokenizer_base_model and lora_weights are optional. - Also, inference_server is optional if loading model from local system. - All models provided will automatically appear in compare model mode - Model loading-unloading and related choices will be disabled. Model/lora/server adding will be disabled - :param model_lock_columns: How many columns to show if locking models (and so showing all at once) - If None, then defaults to up to 3 - if -1, then all goes into 1 row - Maximum value is 4 due to non-dynamic gradio rendering elements - :param fail_if_cannot_connect: if doing model locking (e.g. with many models), fail if True. Otherwise ignore. - Useful when many endpoints and want to just see what works, but still have to wait for timeout. 
- - :param temperature: generation temperature - :param top_p: generation top_p - :param top_k: generation top_k - :param num_beams: generation number of beams - :param repetition_penalty: generation repetition penalty - :param num_return_sequences: generation number of sequences (1 forced for chat) - :param do_sample: generation sample - :param max_new_tokens: generation max new tokens - :param min_new_tokens: generation min tokens - :param early_stopping: generation early stopping - :param max_time: maximum time to allow for generation - :param memory_restriction_level: 0 = no restriction to tokens or model, 1 = some restrictions on token 2 = HF like restriction 3 = very low memory case - :param debug: enable debug mode - :param save_dir: directory chat data is saved to - :param share: whether to share the gradio app with sharable URL - :param local_files_only: whether to only use local files instead of doing to HF for models - :param resume_download: whether to resume downloads from HF for models - :param use_auth_token: whether to use HF auth token (requires CLI did huggingface-cli login before) - :param trust_remote_code: whether to use trust any code needed for HF model - :param rope_scaling: - For HF transformers model: scaling for rope-based models, e.g. --rope_scaling="{'type':'dynamic', 'factor':4}" - For exllama model: --rope_scaling="{'alpha_value':4}" . This automatically scales max_seq_len for exllama - :param max_seq_len: Manually set maximum sequence length for the LLM - :param offload_folder: path for spilling model onto disk - :param src_lang: source languages to include if doing translation (None = all) - :param tgt_lang: target languages to include if doing translation (None = all) - - :param prepare_offline_level: - Whether to just prepare for offline use, do not go into cli, eval, or gradio run modes - 0 : no prep - 1: prepare just h2oGPT with exact same setup as passed to CLI and ensure all artifacts for h2oGPT alone added to ~/.cache/ - 2: prepare h2oGPT + all inference servers so h2oGPT+inference servers can use the ~/.cache/ - :param cli: whether to use CLI (non-gradio) interface. - :param cli_loop: whether to loop for CLI (False usually only for testing) - :param gradio: whether to enable gradio, or to enable benchmark mode - :param gradio_offline_level: > 0, then change fonts so full offline - == 1 means backend won't need internet for fonts, but front-end UI might if font not cached - == 2 means backend and frontend don't need internet to download any fonts. - Note: Some things always disabled include HF telemetry, gradio telemetry, chromadb posthog that involve uploading. - This option further disables google fonts for downloading, which is less intrusive than uploading, - but still required in air-gapped case. The fonts don't look as nice as google fonts, but ensure full offline behavior. - Also set --share=False to avoid sharing a gradio live link. - :param server_name: IP to use. In linux 0.0.0.0 is good choice so exposed to outside host, else for only local use 127.0.0.1. - For windows/MAC 0.0.0.0 or 127.0.0.1 will work, but may need to specify actual LAN IP address for other LAN clients to see. - :param root_path: The root path (or "mount point") of the application, - if it's not served from the root ("/") of the domain. Often used when the application is behind a reverse proxy - that forwards requests to the application. For example, if the application is served at "https://example.com/myapp", - the `root_path` should be set to "/myapp". 
- :param chat: whether to enable chat mode with chat history - :param chat_conversation: list of tuples of (human, bot) conversation pre-appended to existing chat when using instruct/chat models - Requires also add_chat_history_to_context = True - It does *not* require chat=True, so works with nochat_api etc. - :param text_context_list: List of strings to add to context for non-database version of document Q/A for faster handling via API etc. - Forces LangChain code path and uses as many entries in list as possible given max_seq_len, with first assumed to be most relevant and to go near prompt. - :param stream_output: whether to stream output - :param async_output: Whether to do asyncio handling - For summarization - Applicable to HF TGI server - Only if stream_output=False in CLI, UI, or API - :param num_async: Number of simultaneously allowed asyncio calls to make for async_output - Too many will overload inference server, too few will be too slow - :param show_examples: whether to show clickable examples in gradio - :param verbose: whether to show verbose prints - :param h2ocolors: whether to use H2O.ai theme - :param dark: whether to use dark mode for UI by default (still controlled in UI) - :param height: height of chat window - :param show_lora: whether to show LORA options in UI (expert so can be hard to understand) - :param show_llama: whether to show LLaMa.cpp/GPT4All options in UI (only likely useful if have weak GPUs) - :param show_gpt4all: whether to show GPT4All models in UI (not often useful, llama.cpp models best) - :param login_mode_if_model0: set to True to load --base_model after client logs in, to be able to free GPU memory when model is swapped - :param block_gradio_exit: whether to block gradio exit (used for testing) - :param concurrency_count: gradio concurrency count (1 is optimal for LLMs) - :param api_open: If False, don't let API calls skip gradio queue - :param allow_api: whether to allow API calls at all to gradio server - :param input_lines: how many input lines to show for chat box (>1 forces shift-enter for submit, else enter is submit) - :param gradio_size: Overall size of text and spaces: "xsmall", "small", "medium", "large". - Small useful for many chatbots in model_lock mode - :param show_copy_button: Whether to show copy button for chatbots - :param large_file_count_mode: Whether to force manual update to UI of drop-downs, good idea if millions of chunks or documents - :param pre_load_embedding_model: Whether to preload embedding model for shared use across DBs and users (multi-thread safe only) - - :param auth: gradio auth for launcher in form [(user1, pass1), (user2, pass2), ...] - e.g. --auth=[('jon','password')] with no spaces - e.g. --auth="[('jon', 'password)())(')]" so any special characters can be used - e.g. --auth=auth.json to specify persisted state file with name auth.json (auth_filename then not required) - e.g. --auth='' will use default auth.json as file name for persisted state file (auth_filename then not required) - e.g. 
--auth=None will use no auth, but still keep track of auth state, just not from logins - :param auth_filename: - Set auth filename, used only if --auth= was passed list of user/passwords - :param auth_access: - 'open': Allow new users to be added - 'closed': Stick to existing users - :param auth_freeze: whether freeze authentication based upon current file, no longer update file - :param auth_message: Message to show if having users login, fixed if passed, else dynamic internally - :param guest_name: guess name if using auth and have open access. - If '', then no guest allowed even if open access, then all databases for each user always persisted - :param enforce_h2ogpt_api_key: Whether to enforce h2oGPT token usage for API - :param h2ogpt_api_keys: list of tokens allowed for API access or file accessed on demand for json of list of keys - :param h2ogpt_key: E.g. can be set when accessing gradio h2oGPT server from local gradio h2oGPT server that acts as client to that inference server - - :param max_max_time: Maximum max_time for gradio slider - :param max_max_new_tokens: Maximum max_new_tokens for gradio slider - :param min_max_new_tokens: Minimum of max_new_tokens, when auto-scaling down to handle more docs/prompt, but still let generation have some tokens - - :param visible_models: Which models in model_lock list to show by default - Takes integers of position in model_lock (model_states) list or strings of base_model names - Ignored if model_lock not used - For nochat API, this is single item within a list for model by name or by index in model_lock - If None, then just use first model in model_lock list - If model_lock not set, use model selected by CLI --base_model etc. - - :param visible_visible_models: Whether visible models drop-down is visible in UI - :param visible_submit_buttons: whether submit buttons are visible when UI first comes up - :param visible_side_bar: whether left side bar is visible when UI first comes up - :param visible_doc_track: whether left side bar's document tracking is visible when UI first comes up - :param visible_chat_tab: "" for chat tab - :param visible_doc_selection_tab: "" for doc selection tab - :param visible_doc_view_tab: "" for doc view tab - :param visible_chat_history_tab: "" for chat history tab - :param visible_expert_tab: "" for expert tab - :param visible_models_tab: "" for models tab - :param visible_system_tab: "" for system tab - :param visible_tos_tab: "" for ToS tab - :param visible_login_tab: "" for Login tab - :param visible_hosts_tab: "" for hosts tab - :param chat_tables: Just show Chat as block without tab (useful if want only chat view) - :param visible_h2ogpt_header: Whether github stars, URL, logo, and QR code are visible - :param max_raw_chunks: Maximum number of chunks to show in UI when asking for raw DB text from documents/collection - - :param sanitize_user_prompt: whether to remove profanity from user input (slows down input processing) - Requires optional packages: - pip install alt-profanity-check==1.2.2 better-profanity==0.7.0 - :param sanitize_bot_response: whether to remove profanity and repeat lines from bot output (about 2x slower generation for long streaming cases due to better_profanity being slow) - :param extra_model_options: extra models to show in list in gradio - :param extra_lora_options: extra LORA to show in list in gradio - :param extra_server_options: extra servers to show in list in gradio - :param score_model: which model to score responses - None: no response scoring - 'auto': auto mode, '' (no 
model) for CPU or 1 GPU, 'OpenAssistant/reward-model-deberta-v3-large-v2' for >=2 GPUs, - because on CPU takes too much compute just for scoring response - :param eval_filename: json file to use for evaluation, if None is sharegpt - :param eval_prompts_only_num: for no gradio benchmark, if using eval_filename prompts for eval instead of examples - :param eval_prompts_only_seed: for no gradio benchmark, seed for eval_filename sampling - :param eval_as_output: for no gradio benchmark, whether to test eval_filename output itself - - :param langchain_mode: Data source to include. Choose "UserData" to only consume files from make_db.py. - None: auto mode, check if langchain package exists, at least do LLM if so, else Disabled - If not passed, then chosen to be first langchain_modes, else langchain_mode->Disabled is set if no langchain_modes either - WARNING: wiki_full requires extra data processing via read_wiki_full.py and requires really good workstation to generate db, unless already present. - :param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode. - If already have db, any new/changed files are added automatically if path set, does not have to be same path used for prior db sources - :param langchain_modes: dbs to generate at launch to be ready for LLM - Apart from additional user-defined collections, can include ['wiki', 'wiki_full', 'UserData', 'MyData', 'github h2oGPT', 'DriverlessAI docs'] - But wiki_full is expensive and requires preparation - To allow personal space only live in session, add 'MyData' to list - Default: If only want to consume local files, e.g. prepared by make_db.py, only include ['UserData'] - If have own user modes, need to add these here or add in UI. - :param langchain_mode_paths: dict of langchain_mode keys and disk path values to use for source of documents - E.g. "{'UserData2': 'userpath2'}" - A disk path be None, e.g. --langchain_mode_paths="{'UserData2': None}" even if existing DB, to avoid new documents being added from that path, source links that are on disk still work. - If `--user_path` was passed, that path is used for 'UserData' instead of the value in this dict - :param langchain_mode_types: dict of langchain_mode keys and database types - E.g. python generate.py --base_model=llama --langchain_modes=['TestData'] --langchain_mode_types="{'TestData':'shared'}" - The type is attempted to be inferred if directory already exists, then don't have to pass this - :param detect_user_path_changes_every_query: whether to detect if any files changed or added every similarity search (by file hashes). - Expensive for large number of files, so not done by default. By default only detect changes during db loading. - - :param langchain_action: Mode langchain operations in on documents. - Query: Make query of document(s) - Summarize or Summarize_map_reduce: Summarize document(s) via map_reduce - Summarize_all: Summarize document(s) using entire document at once - Summarize_refine: Summarize document(s) using entire document, and try to refine before returning summary - :param langchain_agents: Which agents to use - 'search': Use Web Search as context for LLM response, e.g. SERP if have SERPAPI_API_KEY in env - :param force_langchain_evaluate: Whether to force langchain LLM use even if not doing langchain, mostly for testing. 
- - :param visible_langchain_actions: Which actions to allow - :param visible_langchain_agents: Which agents to allow - - :param document_subset: Default document choice when taking subset of collection - :param document_choice: Chosen document(s) by internal name, 'All' means use all docs - - :param use_llm_if_no_docs: Whether to use LLM even if no documents, when langchain_mode=UserData or MyData or custom - :param load_db_if_exists: Whether to load chroma db if exists or re-generate db - :param keep_sources_in_context: Whether to keep url sources in context, not helpful usually - :param db_type: 'faiss' for in-memory - 'chroma' (for chroma >= 0.4) - 'chroma_old' (for chroma < 0.4) -- recommended for large collections - 'weaviate' for persisted on disk - :param use_openai_embedding: Whether to use OpenAI embeddings for vector db - :param use_openai_model: Whether to use OpenAI model for use with vector db - :param hf_embedding_model: Which HF embedding model to use for vector db - Default is instructor-large with 768 parameters per embedding if have GPUs, else all-MiniLM-L6-v2 if no GPUs - Can also choose simpler model with 384 parameters per embedding: "sentence-transformers/all-MiniLM-L6-v2" - Can also choose even better embedding with 1024 parameters: 'hkunlp/instructor-xl' - We support automatically changing of embeddings for chroma, with a backup of db made if this is done - :param migrate_embedding_model: whether to use hf_embedding_model embedding even if database already had an embedding set. - used to migrate all embeddings to a new one, but will take time to re-embed. - Default (False) is to use the prior embedding for existing databases, and only use hf_embedding_model for new databases - If had old database without embedding saved, then hf_embedding_model is also used. - :param auto_migrate_db: whether to automatically migrate any chroma<0.4 database from duckdb -> sqlite version - :param cut_distance: Distance to cut off references with larger distances when showing references. - 1.64 is good to avoid dropping references for all-MiniLM-L6-v2, but instructor-large will always show excessive references. - For all-MiniLM-L6-v2, a value of 1.5 can push out even more references, or a large value of 100 can avoid any loss of references. - :param answer_with_sources: Whether to determine (and return) sources - :param append_sources_to_answer: Whether to place source information in chat response (ignored by LLM). Always disabled for API. 
- :param show_accordions: whether to show accordion for document references in chatbot UI - :param top_k_docs_max_show: Max number of docs to show in UI for sources - If web search is enabled, then this is modified to be max(top_k_docs_max_show, number of links used in search) - :param show_link_in_sources: Whether to show URL link to source document in references - :param pre_prompt_query: prompt before documents to query, if None then use internal defaults - :param prompt_query: prompt after documents to query, if None then use internal defaults - :param pre_prompt_summary: prompt before documents to summarize, if None then use internal defaults - :param prompt_summary: prompt after documents to summarize, if None then use internal defaults - For summarize, normal to have empty query (nothing added in ask anything in UI or empty string in API) - If pass query, template is "Focusing on %s, %s" % (query, prompt_summary) - If pass query and iinput, template is "Focusing on %s, %s, %s" % (query, iinput, prompt_summary) - :param add_chat_history_to_context: Include chat context when performing action - Not supported yet for openai_chat when using document collection instead of LLM - Also not supported when using CLI mode - :param add_search_to_context: Include web search in context as augmented prompt - :param context: Default context to use (for system pre-context in gradio UI) - context comes before chat_conversation and any document Q/A from text_context_list - :param iinput: Default input for instruction-based prompts - :param allow_upload_to_user_data: Whether to allow file uploads to update shared vector db (UserData or custom user dbs) - Ensure pass user_path for the files uploaded to be moved to this location for linking. - :param reload_langchain_state: Whether to reload langchain_modes.pkl file that contains any new user collections. - :param allow_upload_to_my_data: Whether to allow file uploads to update personal vector db - :param enable_url_upload: Whether to allow upload from URL - :param enable_text_upload: Whether to allow upload of text - :param enable_sources_list: Whether to allow list (or download for non-shared db) of list of sources for chosen db - :param chunk: Whether to chunk data (True unless know data is already optimally chunked) - :param chunk_size: Size of chunks, with typically top-4 passed to LLM, so needs to be in context length - :param top_k_docs: For langchain_action query: number of chunks to give LLM - -1 : auto-fills context up to max_seq_len - For langchain_action summarize: number of document parts, like pages for PDF. - There's no such thing as chunks for summarization. - -1 : auto-fills context up to max_seq_len - :param docs_ordering_type: - Type of ordering of docs. - 'best_first': Order by score so score is worst match near prompt - 'best_near_prompt' or 'reverse_sort' : reverse docs order so most relevant is closest to question. - Best choice for sufficiently smart model, and truncation occurs for oldest context, so best then too. - But smaller 6_9 models fail to use newest context and can get stuck on old information. - '' or None (i.e. default) or 'reverse_ucurve_sort' : Sort so most relevant is either near start or near end - Best to avoid "lost in middle" as well as avoid hallucinating off starting content that LLM focuses on alot. 
-    :param auto_reduce_chunks: Whether to automatically reduce top_k_docs to fit context given prompt
-    :param max_chunks: If top_k_docs=-1, maximum number of chunks to allow
-    :param headsize: Maximum number of characters for head of document for UI to show
-    :param n_jobs: Number of processors to use when consuming documents (-1 = all, is default)
-
-    :param use_unstructured: Enable unstructured URL loader
-    :param use_playwright: Enable PlayWright URL loader
-    :param use_selenium: Enable Selenium URL loader
-
-    :param use_pymupdf: enable PyMuPDF, 'auto' means use first, use others if they are 'auto' if no result
-    :param use_unstructured_pdf: enable Unstructured PDF loader, 'auto' means use if pymupdf fails to get doc result
-    :param use_pypdf: enable PyPDF loader, 'auto' means use if unstructured fails to get doc result
-    :param enable_pdf_ocr: 'auto' means only use OCR if normal text extraction fails. Useful for pure image-based PDFs with text.
-        if enable_pdf_doctr == 'on' then don't do.
-        'on' means always do OCR as additional parsing of same documents
-        'off' means don't do OCR (e.g. because it's slow even if 'auto' only would trigger if nothing else worked)
-    :param enable_pdf_doctr: Whether to support doctr on pdfs, 'auto' means use if failed to get doc result so far
-    :param try_pdf_as_html: Try "PDF" as if HTML file, in case web link has .pdf extension but really is just HTML
-
-    :param enable_ocr: Whether to support OCR on images
-    :param enable_doctr: Whether to support doctr on images (using OCR better than enable_ocr=True)
-    :param enable_pix2struct: Whether to support pix2struct on images for captions
-    :param enable_captions: Whether to support captions using BLIP for image files as documents,
-        then preloads that model if pre_load_caption_model=True
-
-    :param pre_load_caption_model: Whether to preload caption model, or load after forking parallel doc loader
-        parallel loading disabled if preload and have images, to prevent deadlocking on cuda context
-        Recommended if using larger caption model
-    :param captions_model: Which model to use for captions.
-        captions_model: str = "Salesforce/blip-image-captioning-base", # continue capable
-        captions_model: str = "Salesforce/blip2-flan-t5-xl", # question/answer capable, 16GB state
-        captions_model: str = "Salesforce/blip2-flan-t5-xxl", # question/answer capable, 60GB state
-        Note: opt-based blip2 models are not permissively licensed due to opt and Meta license restrictions
-        Disabled for CPU since BLIP requires CUDA
-    :param caption_gpu: If support caption, then use GPU if exists
-
-    :param doctr_gpu: If support doctr, then use GPU if exists
-
-    :param jq_schema: control json loader
-        By default '.[]' ingests everything in brute-force way, but better to match your schema
-        See: https://python.langchain.com/docs/modules/data_connection/document_loaders/json#using-jsonloader
-
-    :param max_quality: Choose maximum quality ingestion with all available parsers
-        Pro: Catches document when some default parsers would fail
-        Pro: Enables DocTR that has much better OCR than Tesseract
-        Con: Fills DB with results from all parsers, so similarity search gives redundant results
-
-    :param enable_heap_analytics: Toggle telemetry.
-    :param heap_app_id: App ID for Heap, change to your ID.
- :return: - """ - if base_model is None: - base_model = '' - if tokenizer_base_model is None: - tokenizer_base_model = '' - if lora_weights is None: - lora_weights = '' - if inference_server is None: - inference_server = '' - - # listen to env if set - model_lock = os.getenv('model_lock', str(model_lock)) - model_lock = ast.literal_eval(model_lock) - - chat_conversation = str_to_list(chat_conversation) - text_context_list = str_to_list(text_context_list) - - llamacpp_dict = str_to_dict(llamacpp_dict) - # add others to single dict - llamacpp_dict['model_path_llama'] = model_path_llama - llamacpp_dict['model_name_gptj'] = model_name_gptj - llamacpp_dict['model_name_gpt4all_llama'] = model_name_gpt4all_llama - llamacpp_dict['model_name_exllama_if_no_config'] = model_name_exllama_if_no_config - # if user overrides but doesn't set these: - if 'n_batch' not in llamacpp_dict: - llamacpp_dict['n_batch'] = 128 - if 'n_gpu_layers' not in llamacpp_dict: - llamacpp_dict['n_gpu_layers'] = 100 - if 'n_gqa' not in llamacpp_dict: - llamacpp_dict['n_gqa'] = 0 - - if os.environ.get('SERPAPI_API_KEY') is None and LangChainAgent.SEARCH.value in visible_langchain_agents: - visible_langchain_agents.remove(LangChainAgent.SEARCH.value) - - if model_lock: - assert gradio, "model_lock only supported for gradio=True" - assert not cli, "model_lock only supported for cli=False" - assert not (not cli and not gradio), "model_lock only supported for eval (cli=gradio=False)" - assert not base_model, "Don't specify model_lock and base_model" - assert not tokenizer_base_model, "Don't specify model_lock and tokenizer_base_model" - assert not lora_weights, "Don't specify model_lock and lora_weights" - assert not inference_server, "Don't specify model_lock and inference_server" - # assert not prompt_type, "Don't specify model_lock and prompt_type" - # assert not prompt_dict, "Don't specify model_lock and prompt_dict" - - n_jobs = int(os.getenv('n_jobs', str(n_jobs))) - is_hf = bool(int(os.getenv("HUGGINGFACE_SPACES", '0'))) - is_gpth2oai = bool(int(os.getenv("GPT_H2O_AI", '0'))) - is_public = is_hf or is_gpth2oai # multi-user case with fixed model and disclaimer - if is_public: - visible_tos_tab = visible_hosts_tab = True - if enforce_h2ogpt_api_key is None: - enforce_h2ogpt_api_key = True - else: - if enforce_h2ogpt_api_key is None: - enforce_h2ogpt_api_key = False - if isinstance(h2ogpt_api_keys, str) and not os.path.isfile(h2ogpt_api_keys): - h2ogpt_api_keys = str_to_list(h2ogpt_api_keys) - if memory_restriction_level is None: - memory_restriction_level = 2 if is_hf else 0 # 2 assumes run on 24GB consumer GPU - else: - assert 0 <= memory_restriction_level <= 3, "Bad memory_restriction_level=%s" % memory_restriction_level - if n_jobs == -1: - # if -1, assume hypercores, don't use, force user to pass n_jobs to be specific if not standard cores - n_jobs = max(1, os.cpu_count() // 2) - if is_public and os.getenv('n_jobs') is None: - n_jobs = min(n_jobs, max(1, min(os.cpu_count() // 2, 8))) - admin_pass = os.getenv("ADMIN_PASS") - # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result - # but becomes unrecoverable sometimes if raise, so just be silent for now - raise_generate_gpu_exceptions = True - - rope_scaling = str_to_dict(rope_scaling) - - if isinstance(auth, str): - if auth.strip().startswith('['): - auth = str_to_list(auth) - if isinstance(auth, str) and auth: - auth_filename = auth - if not auth_filename: - auth_filename = "auth.json" - assert isinstance(auth, (str, list, 
tuple, type(None))), "Unknown type %s for auth=%s" % (type(auth), auth) - - # allow set token directly - use_auth_token = os.environ.get("HUGGING_FACE_HUB_TOKEN", use_auth_token) - allow_upload_to_user_data = bool( - int(os.environ.get("allow_upload_to_user_data", str(int(allow_upload_to_user_data))))) - allow_upload_to_my_data = bool(int(os.environ.get("allow_upload_to_my_data", str(int(allow_upload_to_my_data))))) - height = int(os.environ.get("HEIGHT", height)) - h2ocolors = bool(int(os.getenv('h2ocolors', h2ocolors))) - - # allow enabling langchain via ENV - # FIRST PLACE where LangChain referenced, but no imports related to it - langchain_modes = ast.literal_eval(os.environ.get("langchain_modes", str(langchain_modes))) - if not isinstance(langchain_modes, list): - langchain_modes = [] - # always allow DISABLED - if LangChainMode.DISABLED.value not in langchain_modes: - langchain_modes.append(LangChainMode.DISABLED.value) - - # update - langchain_mode_paths = str_to_dict(langchain_mode_paths) - langchain_mode_types = str_to_dict(langchain_mode_types) - for lmode in [LangChainMode.GITHUB_H2OGPT.value, - LangChainMode.H2O_DAI_DOCS.value, - LangChainMode.WIKI.value, - LangChainMode.WIKI_FULL.value, - ]: - if lmode not in langchain_mode_types: - langchain_mode_types[lmode] = 'shared' - if lmode not in langchain_mode_paths: - langchain_mode_types[lmode] = '' - if user_path: - user_path = makedirs(user_path, use_base=True) - langchain_mode_paths['UserData'] = user_path - langchain_mode_paths['UserData'] = LangChainTypes.SHARED.value - - if is_public: - allow_upload_to_user_data = False - if LangChainMode.USER_DATA.value in langchain_modes: - langchain_modes.remove(LangChainMode.USER_DATA.value) - if max_raw_chunks is None: - max_raw_chunks = 30 if is_public else 1000000 - - # in-place, for non-scratch dbs - if allow_upload_to_user_data: - # always listen to CLI-passed user_path if passed - if user_path: - langchain_mode_paths['UserData'] = user_path - - assert langchain_action in langchain_actions, "Invalid langchain_action %s not in %s" % ( - langchain_action, langchain_actions) - assert len( - set(langchain_agents).difference(langchain_agents_list)) == 0, "Invalid langchain_agents %s" % langchain_agents - - # auto-set langchain_mode - langchain_mode = os.environ.get("LANGCHAIN_MODE", langchain_mode) - if have_langchain and langchain_mode is None: - # start in chat mode, in case just want to chat and don't want to get "No documents to query" by default. - if LangChainMode.LLM.value in langchain_modes: - langchain_mode = LangChainMode.LLM.value - elif len(langchain_modes) >= 1: - # infer even if don't pass which langchain_mode, just langchain_modes. - langchain_mode = langchain_modes[0] - if allow_upload_to_user_data and not is_public and langchain_mode_paths['UserData']: - if verbose: - print("Auto set langchain_mode=%s. Could use UserData instead." % langchain_mode, flush=True) - elif allow_upload_to_my_data: - if verbose: - print("Auto set langchain_mode=%s. Could use MyData instead." 
- " To allow UserData to pull files from disk," - " set user_path or langchain_mode_paths, and ensure allow_upload_to_user_data=True" % langchain_mode, - flush=True) - else: - raise RuntimeError("Please pass --langchain_mode= out of %s" % langchain_modes) - if not have_langchain and langchain_mode not in [None, LangChainMode.DISABLED.value, LangChainMode.LLM.value]: - raise RuntimeError("Asked for LangChain mode but langchain python package cannot be found.") - if langchain_mode is None: - # if not set yet, disable - langchain_mode = LangChainMode.DISABLED.value - print("Auto set langchain_mode=%s Have langchain package: %s" % (langchain_mode, have_langchain), flush=True) - # go ahead and add - if langchain_mode not in langchain_modes: - langchain_modes.append(langchain_mode) - - if is_public: - allow_upload_to_user_data = False - input_lines = 1 # ensure set, for ease of use - temperature = 0.2 if temperature is None else temperature - top_p = 0.85 if top_p is None else top_p - top_k = 70 if top_k is None else top_k - if is_hf: - do_sample = True if do_sample is None else do_sample - top_k_docs = 3 if top_k_docs is None else top_k_docs - else: - # by default don't sample, too chatty - do_sample = False if do_sample is None else do_sample - top_k_docs = 4 if top_k_docs is None else top_k_docs - - if memory_restriction_level == 2: - if not base_model and not inference_server and not model_lock: - base_model = 'h2oai/h2ogpt-oasst1-512-12b' - # don't set load_8bit if passed base_model, doesn't always work so can't just override - load_8bit = True - load_4bit = False # FIXME - consider using 4-bit instead of 8-bit - elif not inference_server: - top_k_docs = 10 if top_k_docs is None else top_k_docs - if memory_restriction_level >= 2: - load_8bit = True - load_4bit = False # FIXME - consider using 4-bit instead of 8-bit - if hf_embedding_model is None: - hf_embedding_model = "sentence-transformers/all-MiniLM-L6-v2" - top_k_docs = 3 if top_k_docs is None else top_k_docs - if top_k_docs is None: - top_k_docs = 3 - if is_public: - if not max_time: - max_time = 60 * 2 - if not max_max_time: - max_max_time = max_time - if not max_new_tokens: - max_new_tokens = 256 - if not max_max_new_tokens: - max_max_new_tokens = 512 - else: - if not max_max_time: - max_max_time = 60 * 20 - if not max_max_new_tokens: - max_max_new_tokens = 1024 - if is_hf: - # must override share if in spaces - share = False - if not max_time: - max_time = 60 * 1 - if not max_max_time: - max_max_time = max_time - # HF accounted for later in get_max_max_new_tokens() - save_dir = os.getenv('SAVE_DIR', save_dir) - save_dir = makedirs(save_dir, exist_ok=True, tmp_ok=True, use_base=True) - score_model = os.getenv('SCORE_MODEL', score_model) - if str(score_model) == 'None': - score_model = '' - concurrency_count = int(os.getenv('CONCURRENCY_COUNT', concurrency_count)) - api_open = bool(int(os.getenv('API_OPEN', str(int(api_open))))) - allow_api = bool(int(os.getenv('ALLOW_API', str(int(allow_api))))) - - n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 - n_gpus, gpu_ids = cuda_vis_check(n_gpus) - - if load_half is None and t5_type(base_model): - load_half = False - print("load_half=%s auto-set for %s to avoid bad generation" % (load_half, base_model), flush=True) - - if n_gpus == 0 or get_device() == "mps": - # No CUDA GPUs usable - - if get_device() != "mps": - print("No GPUs detected", flush=True) - - enable_captions = False - gpu_id = None - load_8bit = False - load_4bit = False - low_bit_mode = 1 - if load_half 
is None: - # wouldn't work if specified True, but respect - load_half = False - load_gptq = '' - load_exllama = False - use_gpu_id = False - if get_device() == "cuda": - torch.backends.cudnn.benchmark = True - torch.backends.cudnn.enabled = False - torch.set_default_dtype(torch.float32) - if is_public and not inference_server and not model_lock: - # 12B uses ~94GB - # 6.9B uses ~47GB - base_model = 'h2oai/h2ogpt-oig-oasst1-512-6_9b' if not base_model else base_model - if hf_embedding_model is None: - # if no GPUs, use simpler embedding model to avoid cost in time - hf_embedding_model = "sentence-transformers/all-MiniLM-L6-v2" - if score_model == 'auto': - score_model = '' - else: - if load_half is None: - load_half = True - # CUDA GPUs visible - if score_model == 'auto': - if n_gpus >= 2: - # will by default place scoring model on last GPU - score_model = 'OpenAssistant/reward-model-deberta-v3-large-v2' - else: - score_model = '' - if hf_embedding_model is None: - # if still None, then set default - hf_embedding_model = 'hkunlp/instructor-large' - - # get defaults - if base_model: - model_lower = base_model.lower() - elif model_lock: - # have 0th model be thought of as normal model - assert len(model_lock) > 0 and model_lock[0]['base_model'] - model_lower = model_lock[0]['base_model'].lower() - else: - model_lower = '' - if not gradio: - # force, else not single response like want to look at - stream_output = False - # else prompt removal can mess up output - chat = False - # hard-coded defaults - first_para = False - text_limit = None - - if compile_model is None: - # too avoid noisy CLI - compile_model = not cli - - if offload_folder: - offload_folder = makedirs(offload_folder, exist_ok=True, tmp_ok=True, use_base=True) - - # defaults - caption_loader = None - doctr_loader = None - pix2struct_loader = None - - image_loaders_options0, image_loaders_options, \ - pdf_loaders_options0, pdf_loaders_options, \ - url_loaders_options0, url_loaders_options = lg_to_gr(**locals()) - jq_schema0 = jq_schema - # transcribe - image_loaders = image_loaders_options0 - pdf_loaders = pdf_loaders_options0 - url_loaders = url_loaders_options0 - - placeholder_instruction, placeholder_input, \ - stream_output, show_examples, \ - prompt_type, prompt_dict, \ - temperature, top_p, top_k, num_beams, \ - max_new_tokens, min_new_tokens, early_stopping, max_time, \ - repetition_penalty, num_return_sequences, \ - do_sample, \ - src_lang, tgt_lang, \ - examples, \ - task_info = \ - get_generate_params(model_lower, - chat, - stream_output, show_examples, - prompt_type, prompt_dict, - system_prompt, - pre_prompt_query, prompt_query, - pre_prompt_summary, prompt_summary, - temperature, top_p, top_k, num_beams, - max_new_tokens, min_new_tokens, early_stopping, max_time, - repetition_penalty, num_return_sequences, - do_sample, - top_k_docs, - chunk, - chunk_size, - image_loaders, - pdf_loaders, - url_loaders, - jq_schema, - docs_ordering_type, - min_max_new_tokens, - verbose, - ) - - git_hash = get_githash() if is_public or os.getenv('GET_GITHASH') else "GET_GITHASH" - locals_dict = locals() - locals_print = '\n'.join(['%s: %s' % (k, v) for k, v in locals_dict.items()]) - if verbose: - print(f"Generating model with params:\n{locals_print}", flush=True) - print("Command: %s\nHash: %s" % (str(' '.join(sys.argv)), git_hash), flush=True) - - if langchain_mode != "Disabled": - # SECOND PLACE where LangChain referenced, but all imports are kept local so not required - from gpt_langchain import prep_langchain, 
get_some_dbs_from_hf, get_persist_directory - if is_hf: - get_some_dbs_from_hf() - dbs = {} - for langchain_mode1 in langchain_modes: - langchain_type = langchain_mode_types.get(langchain_mode1, LangChainTypes.EITHER.value) - if langchain_type == LangChainTypes.PERSONAL.value: - # shouldn't prepare per-user databases here - continue - persist_directory1, langchain_type = get_persist_directory(langchain_mode1, langchain_type=langchain_type) - langchain_mode_types[langchain_mode1] = langchain_type - if langchain_type == LangChainTypes.PERSONAL.value: - # shouldn't prepare per-user databases here - continue - try: - db = prep_langchain(persist_directory1, - load_db_if_exists, - db_type, use_openai_embedding, - langchain_mode1, langchain_mode_paths, langchain_mode_types, - hf_embedding_model, - migrate_embedding_model, - auto_migrate_db, - kwargs_make_db=locals(), - verbose=verbose) - finally: - # in case updated embeddings or created new embeddings - clear_torch_cache() - dbs[langchain_mode1] = db - # remove None db's so can just rely upon k in dbs for if hav db - dbs = {k: v for k, v in dbs.items() if v is not None} - else: - dbs = {} - # import control - if os.environ.get("TEST_LANGCHAIN_IMPORT"): - assert 'gpt_langchain' not in sys.modules, "Dev bug, import of langchain when should not have" - assert 'langchain' not in sys.modules, "Dev bug, import of langchain when should not have" - - other_model_state_defaults = dict(load_8bit=load_8bit, load_4bit=load_4bit, low_bit_mode=low_bit_mode, - load_half=load_half, - load_gptq=load_gptq, load_exllama=load_exllama, use_safetensors=use_safetensors, - revision=revision, use_gpu_id=use_gpu_id, gpu_id=gpu_id, - compile_model=compile_model, - use_cache=use_cache, - llamacpp_dict=llamacpp_dict, model_path_llama=model_path_llama, - model_name_gptj=model_name_gptj, - model_name_gpt4all_llama=model_name_gpt4all_llama, - model_name_exllama_if_no_config=model_name_exllama_if_no_config, - ) - model_state_none = dict(model=None, tokenizer=None, device=None, - base_model=None, tokenizer_base_model=None, lora_weights=None, - inference_server=None, prompt_type=None, prompt_dict=None, - visible_models=None, h2ogpt_key=None, - ) - model_state_none.update(other_model_state_defaults) - my_db_state0 = {LangChainMode.MY_DATA.value: [None, None, None]} - selection_docs_state0 = dict(langchain_modes=langchain_modes, - langchain_mode_paths=langchain_mode_paths, - langchain_mode_types=langchain_mode_types) - selection_docs_state = copy.deepcopy(selection_docs_state0) - - if cli or not gradio: - # initial state for query prompt - model_name = base_model - pre_prompt_query, prompt_query, pre_prompt_summary, prompt_summary = \ - get_langchain_prompts(pre_prompt_query, prompt_query, - pre_prompt_summary, prompt_summary, - model_name, inference_server, - model_path_llama) - - if cli: - from cli import run_cli - return run_cli(**get_kwargs(run_cli, exclude_names=['model_state0'], **locals())) - elif not gradio: - from eval import run_eval - return run_eval(**get_kwargs(run_eval, exclude_names=['model_state0'], **locals())) - elif gradio or prepare_offline_level > 0: - # imported here so don't require gradio to run generate - from gradio_runner import go_gradio - - # get default model - model_states = [] - model_list = [dict(base_model=base_model, tokenizer_base_model=tokenizer_base_model, lora_weights=lora_weights, - inference_server=inference_server, prompt_type=prompt_type, prompt_dict=prompt_dict, - visible_models=None, h2ogpt_key=None)] - 
model_list[0].update(other_model_state_defaults) - # FIXME: hyper per model, not about model loading - # for k in gen_hyper: - # model_list[k] = locals()[k] - - model_list0 = copy.deepcopy(model_list) # just strings, safe to deepcopy - model_state0 = model_state_none.copy() - assert len(model_state_none) == len(model_state0) - if model_lock: - model_list = model_lock - # do reverse, so first is default base_model etc., so some logic works in go_gradio() more easily - for model_dict in reversed(model_list): - # handle defaults user didn't have to pass - # special defaults, ignore defaults for these if not specifically set, replace with '' - model_dict['base_model'] = model_dict.get('base_model', '') - model_dict['tokenizer_base_model'] = model_dict.get('tokenizer_base_model', '') - model_dict['lora_weights'] = model_dict.get('lora_weights', '') - model_dict['inference_server'] = model_dict.get('inference_server', '') - if prepare_offline_level >= 2: - if 'openai' not in model_dict['inference_server'] and 'replicate' not in model_dict['inference_server']: - # assume want locally, but OpenAI and replicate are never local for model part - model_dict['inference_server'] = '' - prompt_type_infer = not model_dict.get('prompt_type') - model_dict['prompt_type'] = model_dict.get('prompt_type', - model_list0[0]['prompt_type']) # don't use mutated value - # rest of generic defaults - for k in model_list0[0]: - if k not in model_dict: - model_dict[k] = model_list0[0][k] - - # begin prompt adjustments - # get query prompt for (say) last base model if using model lock - pre_prompt_query1, prompt_query1, pre_prompt_summary1, prompt_summary1 = ( - get_langchain_prompts(pre_prompt_query, prompt_query, - pre_prompt_summary, prompt_summary, - model_dict['base_model'], - model_dict['inference_server'], - model_dict['model_path_llama'])) - # if mixed setup, choose non-empty so best models best - # FIXME: Make per model dict passed through to evaluate - pre_prompt_query = pre_prompt_query or pre_prompt_query1 - prompt_query = prompt_query or prompt_query1 - pre_prompt_summary = pre_prompt_summary or pre_prompt_summary1 - prompt_summary = prompt_summary or prompt_summary1 - - # try to infer, ignore empty initial state leading to get_generate_params -> 'plain' - if prompt_type_infer: - model_lower1 = model_dict['base_model'].lower() - if model_lower1 in inv_prompt_type_to_model_lower: - model_dict['prompt_type'] = inv_prompt_type_to_model_lower[model_lower1] - model_dict['prompt_dict'], error0 = get_prompt(model_dict['prompt_type'], '', - chat=False, context='', reduced=False, - making_context=False, - return_dict=True, - system_prompt=system_prompt) - else: - model_dict['prompt_dict'] = prompt_dict - else: - model_dict['prompt_dict'] = prompt_dict - model_dict['prompt_dict'] = model_dict.get('prompt_dict', model_dict['prompt_dict']) - # end prompt adjustments - all_kwargs = locals().copy() - all_kwargs.update(model_dict) - if model_dict['base_model'] and not login_mode_if_model0: - model0, tokenizer0, device = get_model(reward_type=False, - **get_kwargs(get_model, exclude_names=['reward_type'], - **all_kwargs)) - else: - # if empty model, then don't load anything, just get gradio up - model0, tokenizer0, device = None, None, None - if model0 is None: - if fail_if_cannot_connect: - raise RuntimeError("Could not connect, see logs") - # skip - if isinstance(model_lock, list): - model_lock.remove(model_dict) - continue - model_state_trial = dict(model=model0, tokenizer=tokenizer0, device=device) - 
model_state_trial.update(model_dict) - diff_keys = set(list(model_state_none.keys())).symmetric_difference(model_state_trial.keys()) - assert len(model_state_none) == len(model_state_trial), diff_keys - print("Model %s" % model_dict, flush=True) - if model_lock: - # last in iteration will be first - model_states.insert(0, model_state_trial) - # fill model_state0 so go_gradio() easier, manage model_states separately - model_state0 = model_state_trial.copy() - else: - model_state0 = model_state_trial.copy() - assert len(model_state_none) == len(model_state0) - - visible_models = str_to_list(visible_models, allow_none=True) # None means first model - all_models = [x.get('base_model', xi) for xi, x in enumerate(model_states)] - visible_models_state0 = [x.get('base_model', xi) for xi, x in enumerate(model_states) if - visible_models is None or - x.get('base_model', xi) in visible_models or - xi in visible_models] - - # update to be consistent with what is passed from CLI and model chose - # do after go over all models if multi-model, so don't contaminate - # This is just so UI shows reasonable correct value, not 2048 dummy value - if len(model_states) >= 1: - max_seq_len = model_states[0]['tokenizer'].model_max_length - - # get score model - all_kwargs = locals().copy() - smodel, stokenizer, sdevice = get_score_model(reward_type=True, - **get_kwargs(get_score_model, exclude_names=['reward_type'], - **all_kwargs)) - score_model_state0 = dict(model=smodel, tokenizer=stokenizer, device=sdevice, - base_model=score_model, tokenizer_base_model='', lora_weights='', - inference_server='', prompt_type='', prompt_dict='') - - if enable_captions: - if pre_load_caption_model: - from image_captions import H2OImageCaptionLoader - caption_loader = H2OImageCaptionLoader(caption_gpu=caption_gpu).load_model() - else: - caption_loader = 'gpu' if n_gpus > 0 and caption_gpu else 'cpu' - else: - caption_loader = False - - if pre_load_embedding_model and langchain_mode != 'Disabled' and not use_openai_embedding: - from src.gpt_langchain import get_embedding - hf_embedding_model = dict(name=hf_embedding_model, - model=get_embedding(use_openai_embedding, hf_embedding_model=hf_embedding_model, - preload=True)) - if enable_doctr or enable_pdf_ocr in [True, 'auto', 'on']: - doctr_loader = 'gpu' if n_gpus > 0 and doctr_gpu else 'cpu' - else: - doctr_loader = False - - # assume gradio needs everything - go_gradio(**locals()) - - -def get_config(base_model, - use_auth_token=False, - trust_remote_code=True, - offload_folder=None, - revision=None, - rope_scaling=None, - triton_attn=False, - long_sequence=True, - return_model=False, - raise_exception=False, - max_seq_len=None, - verbose=False, - ): - from accelerate import init_empty_weights - with init_empty_weights(): - from transformers import AutoConfig - try: - config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token, - trust_remote_code=trust_remote_code, - offload_folder=offload_folder, - revision=revision, - rope_scaling=rope_scaling if rope_scaling else None) - except OSError as e: - if raise_exception: - raise - if 'not a local folder and is not a valid model identifier listed on' in str( - e) or '404 Client Error' in str(e) or "couldn't connect" in str(e): - # e.g. llama, gpjt, etc. - # e.g. HF TGI but not model on HF or private etc. - if max_seq_len is None and base_model.lower() in non_hf_types: - print("Could not determine --max_seq_len, setting to 2048. 
Pass if not correct", flush=True) - max_seq_len = 2048 - # HF TGI server only should really require prompt_type, not HF model state - return None, None, max_seq_len - else: - raise - if triton_attn and 'mpt-' in base_model.lower(): - config.attn_config['attn_impl'] = 'triton' - if long_sequence: - if 'mpt-7b-storywriter' in base_model.lower(): - config.update({"max_seq_len": 83968}) - if 'mosaicml/mpt-7b-chat' in base_model.lower(): - config.update({"max_seq_len": 4096}) - if 'mpt-30b' in base_model.lower(): - config.update({"max_seq_len": 2 * 8192}) - if return_model and \ - issubclass(config.__class__, tuple(AutoModel._model_mapping.keys())): - model = AutoModel.from_config( - config, - trust_remote_code=trust_remote_code, - ) - else: - # can't infer - model = None - if 'falcon' in base_model.lower(): - config.use_cache = False - - # allow override - if max_seq_len is not None: - print("Overriding max_seq_len -> %d" % max_seq_len, flush=True) - else: - if hasattr(config, 'max_seq_len'): - max_seq_len = int(config.max_seq_len) - elif hasattr(config, 'max_position_embeddings') and isinstance(config.max_position_embeddings, int): - # help automatically limit inputs to generate - max_seq_len = config.max_position_embeddings - if verbose: - print("Used max_position_embeddings=%s as base model (pre-rope) max_seq_len." - " If not desired, pass --max_seq_len and set to some integer value." % config.max_position_embeddings, - flush=True) - elif hasattr(config, 'n_ctx'): - # e.g. gpt2 - max_seq_len = int(config.n_ctx) - else: - print("Could not determine --max_seq_len, setting to 2048. Pass if not correct", flush=True) - max_seq_len = 2048 - # FIXME: - # raise RuntimeError("Could not determine max_seq_len," - # " please pass --max_seq_len and set to some value, e.g. 2048.") - - if rope_scaling: - if rope_scaling.get('factor'): - # HF transformers - max_seq_len *= rope_scaling.get('factor') - elif rope_scaling.get('alpha_value'): - # exllama - # Note: exllama's own tokenizer has this set correctly in loaders.py, this config will be unused - max_seq_len *= rope_scaling.get('alpha_value') - print("Automatically setting max_seq_len=%d for RoPE scaling" % max_seq_len, flush=True) - - return config, model, max_seq_len - - -def get_non_lora_model(base_model, model_loader, load_half, - load_gptq, - load_exllama, - use_safetensors, - revision, - model_kwargs, reward_type, - config, model, - gpu_id=0, - ): - """ - Ensure model gets on correct device - """ - - if model is not None: - # NOTE: Can specify max_memory={0: max_mem, 1: max_mem}, to shard model - # NOTE: Some models require avoiding sharding some layers, - # then would pass no_split_module_classes and give list of those layers. - from accelerate import infer_auto_device_map - device_map = infer_auto_device_map( - model, - dtype=torch.float16 if load_half else torch.float32, - ) - if hasattr(model, 'model'): - device_map_model = infer_auto_device_map( - model.model, - dtype=torch.float16 if load_half else torch.float32, - ) - device_map.update(device_map_model) - else: - device_map = "auto" - - n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 - n_gpus, gpu_ids = cuda_vis_check(n_gpus) - - if n_gpus > 0: - if gpu_id >= 0: - # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set. 
- # So avoid for now, just put on first GPU, unless score_model, put on last - if reward_type: - device_map = {'': n_gpus - 1} - else: - device_map = {'': min(n_gpus - 1, gpu_id)} - if gpu_id == -1: - device_map = {'': 'cuda'} - else: - device_map = {'': 'cpu'} - model_kwargs['load_in_8bit'] = False - model_kwargs['load_in_4bit'] = False - print('device_map: %s' % device_map, flush=True) - - load_in_8bit = model_kwargs.get('load_in_8bit', False) - load_in_4bit = model_kwargs.get('load_in_4bit', False) - model_kwargs['device_map'] = device_map - model_kwargs['use_safetensors'] = use_safetensors - model_kwargs['revision'] = revision - pop_unused_model_kwargs(model_kwargs) - - if load_exllama: - model = model_loader - elif load_gptq: - if 'Llama-2-70B-chat-GPTQ' in base_model: - model_kwargs.update(dict(inject_fused_attention=False)) - model_kwargs.pop('torch_dtype', None) - model_kwargs.pop('device_map') - model = model_loader( - model_name_or_path=base_model, - model_basename=load_gptq, - **model_kwargs, - ) - elif load_in_8bit or load_in_4bit or not load_half: - model = model_loader( - base_model, - config=config, - **model_kwargs, - ) - else: - - model = model_loader( - base_model, - config=config, - **model_kwargs, - ) - if not getattr(model, "is_quantized", False): - model = model.half() - return model - - -def get_client_from_inference_server(inference_server, base_model=None, raise_connection_exception=False): - inference_server, headers = get_hf_server(inference_server) - # preload client since slow for gradio case especially - from gradio_utils.grclient import GradioClient - gr_client = None - hf_client = None - if headers is None: - try: - print("GR Client Begin: %s %s" % (inference_server, base_model), flush=True) - # first do sanity check if alive, else gradio client takes too long by default - requests.get(inference_server, timeout=int(os.getenv('REQUEST_TIMEOUT', '30'))) - gr_client = GradioClient(inference_server) - print("GR Client End: %s" % inference_server, flush=True) - except (OSError, ValueError) as e: - # Occurs when wrong endpoint and should have been HF client, so don't hard raise, just move to HF - gr_client = None - print("GR Client Failed %s %s: %s" % (inference_server, base_model, str(e)), flush=True) - except (ConnectTimeoutError, ConnectTimeout, MaxRetryError, ConnectionError, ConnectionError2, - JSONDecodeError, ReadTimeout2, KeyError) as e: - t, v, tb = sys.exc_info() - ex = ''.join(traceback.format_exception(t, v, tb)) - print("GR Client Failed %s %s: %s" % (inference_server, base_model, str(ex)), flush=True) - if raise_connection_exception: - raise - - if gr_client is None: - res = None - from text_generation import Client as HFClient - print("HF Client Begin: %s %s" % (inference_server, base_model)) - try: - hf_client = HFClient(inference_server, headers=headers, timeout=int(os.getenv('REQUEST_TIMEOUT', '30'))) - # quick check valid TGI endpoint - res = hf_client.generate('What?', max_new_tokens=1) - hf_client = HFClient(inference_server, headers=headers, timeout=300) - except (ConnectTimeoutError, ConnectTimeout, MaxRetryError, ConnectionError, ConnectionError2, - JSONDecodeError, ReadTimeout2, KeyError) as e: - hf_client = None - t, v, tb = sys.exc_info() - ex = ''.join(traceback.format_exception(t, v, tb)) - print("HF Client Failed %s %s: %s" % (inference_server, base_model, str(ex))) - if raise_connection_exception: - raise - print("HF Client End: %s %s : %s" % (inference_server, base_model, res)) - return inference_server, gr_client, hf_client - - 
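# get_client_from_inference_server above probes the endpoint with a cheap, short-timeout
# requests.get() before constructing the heavier Gradio/TGI clients, so a dead server
# fails fast instead of hanging inside a client constructor.  A minimal standalone sketch
# of that probe step (the URL and timeout handling here are illustrative assumptions,
# not values taken from this file):
import os
import requests

def _probe_inference_server(url, timeout=None):
    # any HTTP response counts as "alive"; only transport-level failures matter here
    timeout = timeout if timeout is not None else int(os.getenv('REQUEST_TIMEOUT', '30'))
    try:
        requests.get(url, timeout=timeout)
        return True
    except requests.RequestException:
        return False

# e.g. only construct GradioClient(inference_server) or text_generation.Client(...)
# once _probe_inference_server(inference_server) has returned True.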
-def get_model( - load_8bit: bool = False, - load_4bit: bool = False, - low_bit_mode: int = 1, - load_half: bool = True, - load_gptq: str = '', - load_exllama: bool = False, - use_safetensors: bool = False, - revision: str = None, - use_gpu_id: bool = True, - base_model: str = '', - inference_server: str = "", - tokenizer_base_model: str = '', - lora_weights: str = "", - gpu_id: int = 0, - n_jobs=None, - - reward_type: bool = None, - local_files_only: bool = False, - resume_download: bool = True, - use_auth_token: Union[str, bool] = False, - trust_remote_code: bool = True, - offload_folder: str = None, - rope_scaling: dict = None, - max_seq_len: int = None, - compile_model: bool = True, - llamacpp_dict=None, - - verbose: bool = False, -): - """ - - :param load_8bit: load model in 8-bit, not supported by all models - :param load_4bit: load model in 4-bit, not supported by all models - :param low_bit_mode: See gen.py - :param load_half: load model in 16-bit - :param load_gptq: GPTQ model_basename - :param load_exllama: whether to use exllama - :param use_safetensors: use safetensors file - :param revision: - :param use_gpu_id: Use torch infer of optimal placement of layers on devices (for non-lora case) - For non-LORA case, False will spread shards across multiple GPUs, but this can lead to cuda:x cuda:y mismatches - So it is not the default - :param base_model: name/path of base model - :param inference_server: whether base_model is hosted locally ('') or via http (url) - :param tokenizer_base_model: name/path of tokenizer - :param lora_weights: name/path - :param gpu_id: which GPU (0..n_gpus-1) or allow all GPUs if relevant (-1) - :param n_jobs: number of cores to use (e.g. for llama CPU model) - :param reward_type: reward type model for sequence classification - :param local_files_only: use local files instead of from HF - :param resume_download: resume downloads from HF - :param use_auth_token: assumes user did on CLI `huggingface-cli login` to access private repo - :param trust_remote_code: trust code needed by model - :param offload_folder: offload folder - :param rope_scaling: scaling for rope-based models, e.g. 
"{'type':'dynamic', 'factor':4}" - :param max_seq_len: override for maximum sequence length for model - :param max_seq_len: if set, use as max_seq_len for model - :param compile_model: whether to compile torch model - :param llamacpp_dict: dict of llama.cpp and GPT4All model options - :param verbose: - :return: - """ - print("Starting get_model: %s %s" % (base_model, inference_server), flush=True) - - triton_attn = False - long_sequence = True - config_kwargs = dict(use_auth_token=use_auth_token, - trust_remote_code=trust_remote_code, - offload_folder=offload_folder, - rope_scaling=rope_scaling, - triton_attn=triton_attn, - long_sequence=long_sequence, - revision=revision, - max_seq_len=max_seq_len, - verbose=verbose) - config, _, max_seq_len = get_config(base_model, **config_kwargs, raise_exception=False) - - if base_model in non_hf_types: - assert config is None, "Expected config None for %s" % base_model - - llama_type_from_config = 'llama' in str(config).lower() - llama_type_from_name = "llama" in base_model.lower() - llama_type = llama_type_from_config or llama_type_from_name - if "xgen" in base_model.lower() or 'llama2' in base_model.lower() or 'llama-2' in base_model.lower(): - llama_type = False - if llama_type: - if verbose: - print("Detected as llama type from" - " config (%s) or name (%s)" % (llama_type_from_config, llama_type_from_name), flush=True) - - model_name_exllama_if_no_config = '' if not llamacpp_dict else llamacpp_dict.get('model_name_exllama_if_no_config', - '') - model_loader, tokenizer_loader, conditional_type = ( - get_loaders(model_name=base_model, reward_type=reward_type, llama_type=llama_type, - load_gptq=load_gptq, load_exllama=load_exllama, config=config, - rope_scaling=rope_scaling, max_seq_len=max_seq_len, - model_name_exllama_if_no_config=model_name_exllama_if_no_config)) - - tokenizer_kwargs = dict(local_files_only=local_files_only, - resume_download=resume_download, - use_auth_token=use_auth_token, - trust_remote_code=trust_remote_code, - offload_folder=offload_folder, - revision=revision, - padding_side='left', - config=config, - ) - if not tokenizer_base_model: - tokenizer_base_model = base_model - - if load_exllama: - tokenizer = tokenizer_loader - elif config is not None and tokenizer_loader is not None and not isinstance(tokenizer_loader, str): - if load_exllama: - tokenizer = tokenizer_loader - else: - tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model, **tokenizer_kwargs) - # sets raw (no cushion) limit - # If using RoPE with scaling, then for non-exllama models (e.g. HF models), - # then config -> tokenizer will set model_max_length correctly - set_model_max_len(max_seq_len, tokenizer, verbose=False) - # if using fake tokenizer, not really accurate when lots of numbers, give a bit of buffer, else get: - # Generation Failed: Input validation error: `inputs` must have less than 2048 tokens. Given: 2233 - tokenizer.model_max_length = tokenizer.model_max_length - 50 - else: - tokenizer = None - - if isinstance(inference_server, str) and inference_server.startswith("http"): - inference_server, gr_client, hf_client = get_client_from_inference_server(inference_server, - base_model=base_model) - client = gr_client or hf_client - # Don't return None, None for model, tokenizer so triggers - if tokenizer is None: - # FIXME: Could use only tokenizer from llama etc. 
but hard to detatch from model, just use fake for now - if os.getenv("HARD_ASSERTS") and base_model not in non_hf_types: - raise RuntimeError("Unexpected tokenizer=None") - tokenizer = FakeTokenizer() - return client, tokenizer, 'http' - if isinstance(inference_server, str) and ( - inference_server.startswith('openai') or - inference_server.startswith('vllm') or - inference_server.startswith('replicate') or - inference_server.startswith('sagemaker') - ): - if inference_server.startswith('openai'): - assert os.getenv('OPENAI_API_KEY'), "Set environment for OPENAI_API_KEY" - # Don't return None, None for model, tokenizer so triggers - # include small token cushion - max_seq_len = model_token_mapping[base_model] - if inference_server.startswith('replicate'): - assert len(inference_server.split(':')) >= 3, "Expected replicate:model string, got %s" % inference_server - assert os.getenv('REPLICATE_API_TOKEN'), "Set environment for REPLICATE_API_TOKEN" - assert max_seq_len is not None, "Please pass --max_seq_len= for replicate models." - try: - import replicate as replicate_python - except ImportError: - raise ImportError( - "Could not import replicate python package. " - "Please install it with `pip install replicate`." - ) - if inference_server.startswith('sagemaker'): - assert len( - inference_server.split( - ':')) >= 3, "Expected sagemaker_chat::, got %s" % inference_server - assert os.getenv('AWS_ACCESS_KEY_ID'), "Set environment for AWS_ACCESS_KEY_ID" - assert os.getenv('AWS_SECRET_ACCESS_KEY'), "Set environment for AWS_SECRET_ACCESS_KEY" - # Don't return None, None for model, tokenizer so triggers - # include small token cushion - if inference_server.startswith('openai') or tokenizer is None: - # don't use fake (tiktoken) tokenizer for vLLM//replicate if know actual model with actual tokenizer - tokenizer = FakeTokenizer(model_max_length=max_seq_len - 50) - return inference_server, tokenizer, inference_server - assert not inference_server, "Malformed inference_server=%s" % inference_server - if base_model in non_hf_types: - from gpt4all_llm import get_model_tokenizer_gpt4all - model, tokenizer, device = get_model_tokenizer_gpt4all(base_model, n_jobs=n_jobs, - max_seq_len=max_seq_len, - llamacpp_dict=llamacpp_dict) - return model, tokenizer, device - if load_exllama: - return model_loader, tokenizer, 'cuda' - - # get local torch-HF model - return get_hf_model(load_8bit=load_8bit, - load_4bit=load_4bit, - low_bit_mode=low_bit_mode, - load_half=load_half, - load_gptq=load_gptq, - use_safetensors=use_safetensors, - revision=revision, - use_gpu_id=use_gpu_id, - base_model=base_model, - tokenizer_base_model=tokenizer_base_model, - lora_weights=lora_weights, - gpu_id=gpu_id, - - reward_type=reward_type, - local_files_only=local_files_only, - resume_download=resume_download, - use_auth_token=use_auth_token, - trust_remote_code=trust_remote_code, - offload_folder=offload_folder, - rope_scaling=rope_scaling, - compile_model=compile_model, - - llama_type=llama_type, - config_kwargs=config_kwargs, - tokenizer_kwargs=tokenizer_kwargs, - - verbose=verbose) - - -def get_hf_model(load_8bit: bool = False, - load_4bit: bool = False, - low_bit_mode: int = 1, - load_half: bool = True, - load_gptq: str = '', - use_safetensors: bool = False, - revision: str = None, - use_gpu_id: bool = True, - base_model: str = '', - tokenizer_base_model: str = '', - lora_weights: str = "", - gpu_id: int = 0, - - reward_type: bool = None, - local_files_only: bool = False, - resume_download: bool = True, - use_auth_token: 
Union[str, bool] = False, - trust_remote_code: bool = True, - offload_folder: str = None, - rope_scaling: dict = None, - compile_model: bool = True, - - llama_type: bool = False, - config_kwargs=None, - tokenizer_kwargs=None, - - verbose: bool = False, - ): - assert config_kwargs is not None - assert tokenizer_kwargs is not None - - load_exllama = False # Never should be in HF code for exllama - - if lora_weights is not None and lora_weights.strip(): - if verbose: - print("Get %s lora weights" % lora_weights, flush=True) - device = get_device() - - if 'gpt2' in base_model.lower(): - # RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Half - load_8bit = False - load_4bit = False - - assert base_model.strip(), ( - "Please choose a base model with --base_model (CLI) or load one from Models Tab (gradio)" - ) - - model_loader, tokenizer_loader, conditional_type = ( - get_loaders(model_name=base_model, reward_type=reward_type, llama_type=llama_type, - load_gptq=load_gptq, load_exllama=load_exllama)) - - config, _, max_seq_len = get_config(base_model, return_model=False, raise_exception=True, **config_kwargs) - - if tokenizer_loader is not None and not isinstance(tokenizer_loader, str): - if load_exllama: - tokenizer = tokenizer_loader - else: - tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model, - **tokenizer_kwargs) - else: - tokenizer = tokenizer_loader - - if isinstance(tokenizer, str): - # already a pipeline, tokenizer_loader is string for task - model = model_loader(tokenizer, - model=base_model, - device=0 if device == "cuda" else -1, - torch_dtype=torch.float16 if device == 'cuda' else torch.float32) - else: - assert device in ["cuda", "cpu", "mps"], "Unsupported device %s" % device - model_kwargs = dict(local_files_only=local_files_only, - torch_dtype=torch.float16 if device == 'cuda' else torch.float32, - resume_download=resume_download, - use_auth_token=use_auth_token, - trust_remote_code=trust_remote_code, - offload_folder=offload_folder, - revision=revision, - # rope_scaling=rope_scaling, # only put into config - ) - if 'mbart-' not in base_model.lower() and 'mpt-' not in base_model.lower(): - if use_gpu_id and gpu_id is not None and gpu_id >= 0 and device == 'cuda': - device_map = {"": gpu_id} - else: - device_map = "auto" - model_kwargs.update(dict(load_in_8bit=load_8bit, - load_in_4bit=load_4bit, - device_map=device_map, - )) - if 'mpt-' in base_model.lower() and gpu_id is not None and gpu_id >= 0: - # MPT doesn't support spreading over GPUs - model_kwargs.update(dict(device_map={"": gpu_id} if device == 'cuda' else "cpu")) - - if 'OpenAssistant/reward-model'.lower() in base_model.lower(): - # FIXME: could put on other GPUs - model_kwargs['device_map'] = {"": 0} if device == 'cuda' else {"": 'cpu'} - model_kwargs.pop('torch_dtype', None) - pop_unused_model_kwargs(model_kwargs) - - n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 - n_gpus, gpu_ids = cuda_vis_check(n_gpus) - if low_bit_mode == 1 and n_gpus != 0: - from transformers import BitsAndBytesConfig - model_kwargs['quantization_config'] = BitsAndBytesConfig(bnb_4bit_compute_dtype=torch.bfloat16, - load_in_4bit=load_4bit, - load_in_8bit=load_8bit, - ) - elif low_bit_mode == 2 and n_gpus != 0: - from transformers import BitsAndBytesConfig - model_kwargs['quantization_config'] = BitsAndBytesConfig(bnb_4bit_quant_type="nf4", - load_in_4bit=load_4bit, - load_in_8bit=load_8bit, - ) - elif low_bit_mode == 3 and n_gpus != 0: - from transformers import 
BitsAndBytesConfig - model_kwargs['quantization_config'] = BitsAndBytesConfig(bnb_4bit_use_double_quant=True, - load_in_4bit=load_4bit, - load_in_8bit=load_8bit, - ) - elif low_bit_mode == 4 and n_gpus != 0: - from transformers import BitsAndBytesConfig - model_kwargs['quantization_config'] = BitsAndBytesConfig(bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - load_in_4bit=load_4bit, - load_in_8bit=load_8bit, - ) - - if not lora_weights: - # torch.device context uses twice memory for AutoGPTQ - context = NullContext if load_gptq else torch.device - with context(device): - - if use_gpu_id: - config, model, max_seq_len = get_config(base_model, - return_model=True, raise_exception=True, **config_kwargs) - model = get_non_lora_model(base_model, model_loader, load_half, load_gptq, - load_exllama, - use_safetensors, - revision, - model_kwargs, reward_type, - config, model, - gpu_id=gpu_id, - ) - else: - config, _, max_seq_len = get_config(base_model, **config_kwargs) - if load_half and not (load_8bit or load_4bit or load_gptq): - model = model_loader( - base_model, - config=config, - **model_kwargs) - if not getattr(model, "is_quantized", False): - model = model.half() - else: - model = model_loader( - base_model, - config=config, - **model_kwargs) - elif load_8bit or load_4bit: - config, _, max_seq_len = get_config(base_model, **config_kwargs) - model = model_loader( - base_model, - config=config, - **model_kwargs - ) - from peft import PeftModel # loads cuda, so avoid in global scope - model = PeftModel.from_pretrained( - model, - lora_weights, - torch_dtype=torch.float16 if device == 'cuda' else torch.float32, - local_files_only=local_files_only, - resume_download=resume_download, - use_auth_token=use_auth_token, - trust_remote_code=trust_remote_code, - offload_folder=offload_folder, - rope_scaling=rope_scaling, - revision=revision, - device_map={"": 0} if device == 'cuda' else {"": 'cpu'}, # seems to be required - ) - else: - with torch.device(device): - config, _, max_seq_len = get_config(base_model, raise_exception=True, **config_kwargs) - model = model_loader( - base_model, - config=config, - **model_kwargs - ) - from peft import PeftModel # loads cuda, so avoid in global scope - model = PeftModel.from_pretrained( - model, - lora_weights, - torch_dtype=torch.float16 if device == 'cuda' else torch.float32, - local_files_only=local_files_only, - resume_download=resume_download, - use_auth_token=use_auth_token, - trust_remote_code=trust_remote_code, - offload_folder=offload_folder, - rope_scaling=rope_scaling, - device_map="auto", - ) - if load_half and not load_gptq: - if not getattr(model, "is_quantized", False): - model = model.half() - - # unwind broken decapoda-research config - if llama_type: - model.config.pad_token_id = tokenizer.pad_token_id = 0 # unk - model.config.bos_token_id = 1 - model.config.eos_token_id = 2 - if 'gpt2' in base_model.lower(): - # add special tokens that otherwise all share the same id - tokenizer.add_special_tokens({'bos_token': '', - 'eos_token': '', - 'pad_token': ''}) - - if not isinstance(tokenizer, str): - model.eval() - if torch.__version__ >= "2" and sys.platform != "win32" and compile_model: - model = torch.compile(model) - - set_model_max_len(max_seq_len, tokenizer, verbose=False, reward_type=reward_type) - - # tell if conditional type - model.conditional_type = conditional_type - tokenizer.conditional_type = conditional_type - - return model, tokenizer, device - - -def set_model_max_len(max_seq_len, tokenizer, verbose=False, 
reward_type=False): - if reward_type: - # limit deberta, else uses too much memory and not worth response score - tokenizer.model_max_length = 512 - return - - tokenizer.model_max_length = int(max_seq_len) - if verbose: - print("model_max_length=%s" % tokenizer.model_max_length, flush=True) - # for bug in HF transformers - if tokenizer.model_max_length > 100000000: - tokenizer.model_max_length = 2048 - - -def pop_unused_model_kwargs(model_kwargs): - """ - in-place pop unused kwargs that are not dependency-upgrade friendly - no point passing in False, is default, and helps avoid needing to update requirements for new deps - :param model_kwargs: - :return: - """ - check_list = ['load_in_8bit', 'load_in_4bit'] - for k in check_list: - if k in model_kwargs and not model_kwargs[k]: - model_kwargs.pop(k) - - -def get_score_model(score_model: str = None, - load_8bit: bool = False, - load_4bit: bool = False, - low_bit_mode=1, - load_half: bool = True, - load_gptq: str = '', - load_exllama: bool = False, - use_gpu_id: bool = True, - base_model: str = '', - inference_server: str = '', - tokenizer_base_model: str = '', - lora_weights: str = "", - gpu_id: int = 0, - n_jobs=None, - - reward_type: bool = None, - local_files_only: bool = False, - resume_download: bool = True, - use_auth_token: Union[str, bool] = False, - trust_remote_code: bool = True, - offload_folder: str = None, - rope_scaling: dict = None, - compile_model: bool = True, - llamacpp_dict: typing.Dict = None, - - verbose: bool = False, - ): - if score_model is not None and score_model.strip(): - load_8bit = False - load_4bit = False - low_bit_mode = 1 - load_half = False - load_gptq = '' - load_exllama = False - use_safetensors = False - revision = None - base_model = score_model.strip() - tokenizer_base_model = '' - lora_weights = '' - inference_server = '' - llama_type = False - max_seq_len = None - compile_model = False - llamacpp_dict = {} - smodel, stokenizer, sdevice = get_model(reward_type=True, - **get_kwargs(get_model, exclude_names=['reward_type'], **locals())) - else: - smodel, stokenizer, sdevice = None, None, None - return smodel, stokenizer, sdevice - - -def evaluate_fake(*args, **kwargs): - yield dict(response=invalid_key_msg, sources='') - return - - -def evaluate( - model_state, - my_db_state, - selection_docs_state, - requests_state, - # START NOTE: Examples must have same order of parameters - instruction, - iinput, - context, - stream_output, - prompt_type, - prompt_dict, - temperature, - top_p, - top_k, - num_beams, - max_new_tokens, - min_new_tokens, - early_stopping, - max_time, - repetition_penalty, - num_return_sequences, - do_sample, - chat, - instruction_nochat, - iinput_nochat, - langchain_mode, - add_chat_history_to_context, - langchain_action, - langchain_agents, - top_k_docs, - chunk, - chunk_size, - document_subset, - document_choice, - pre_prompt_query, - prompt_query, - pre_prompt_summary, - prompt_summary, - system_prompt, - - image_loaders, - pdf_loaders, - url_loaders, - jq_schema, - visible_models, - h2ogpt_key, - add_search_to_context, - chat_conversation, - text_context_list, - docs_ordering_type, - min_max_new_tokens, - - # END NOTE: Examples must have same order of parameters - captions_model=None, - caption_loader=None, - doctr_loader=None, - pix2struct_loader=None, - async_output=None, - num_async=None, - src_lang=None, - tgt_lang=None, - debug=False, - concurrency_count=None, - save_dir=None, - sanitize_bot_response=False, - model_state0=None, - memory_restriction_level=None, - 
max_max_new_tokens=None, - is_public=None, - max_max_time=None, - raise_generate_gpu_exceptions=None, - lora_weights=None, - use_llm_if_no_docs=True, - load_db_if_exists=True, - dbs=None, - detect_user_path_changes_every_query=None, - use_openai_embedding=None, - use_openai_model=None, - hf_embedding_model=None, - migrate_embedding_model=None, - auto_migrate_db=None, - cut_distance=None, - db_type=None, - n_jobs=None, - first_para=None, - text_limit=None, - show_accordions=None, - top_k_docs_max_show=None, - show_link_in_sources=None, - verbose=False, - cli=False, - use_cache=None, - auto_reduce_chunks=None, - max_chunks=None, - headsize=None, - model_lock=None, - force_langchain_evaluate=None, - model_state_none=None, - load_exllama=None, - answer_with_sources=None, - append_sources_to_answer=None, - image_loaders_options0=None, - pdf_loaders_options0=None, - url_loaders_options0=None, - jq_schema0=None, - keep_sources_in_context=None, -): - # ensure passed these - assert concurrency_count is not None - assert memory_restriction_level is not None - assert raise_generate_gpu_exceptions is not None - assert use_openai_embedding is not None - assert use_openai_model is not None - assert hf_embedding_model is not None - assert migrate_embedding_model is not None - assert auto_migrate_db is not None - assert db_type is not None - assert top_k_docs is not None and isinstance(top_k_docs, int) - assert chunk is not None and isinstance(chunk, bool) - assert chunk_size is not None and isinstance(chunk_size, int) - assert n_jobs is not None - assert first_para is not None - assert isinstance(add_chat_history_to_context, bool) - assert isinstance(add_search_to_context, bool) - assert load_exllama is not None - # for lazy client (even chat client) - if image_loaders is None: - image_loaders = image_loaders_options0 - if pdf_loaders is None: - pdf_loaders = pdf_loaders_options0 - if url_loaders is None: - url_loaders = url_loaders_options0 - if jq_schema is None: - jq_schema = jq_schema0 - if isinstance(langchain_agents, str): - if langchain_agents.strip().startswith('['): - # already list, but as string - langchain_agents = str_to_list(langchain_agents) - else: - # just 1 item and make list - langchain_agents = [langchain_agents] - chat_conversation = str_to_list(chat_conversation) - text_context_list = str_to_list(text_context_list) - - langchain_modes = selection_docs_state['langchain_modes'] - langchain_mode_paths = selection_docs_state['langchain_mode_paths'] - langchain_mode_types = selection_docs_state['langchain_mode_types'] - - if debug: - locals_dict = locals().copy() - locals_dict.pop('model_state', None) - locals_dict.pop('model_state0', None) - locals_dict.pop('model_states', None) - print(locals_dict) - - no_model_msg = "Please choose a base model with --base_model (CLI) or load in Models Tab (gradio).\n" \ - "Then start New Conversation" - - if model_state is None: - model_state = model_state_none.copy() - if model_state0 is None: - # e.g. for no gradio case, set dummy value, else should be set - model_state0 = model_state_none.copy() - - # model_state['model] is only 'model' if should use model_state0 - # model could also be None - have_model_lock = model_lock is not None - have_fresh_model = model_state['model'] not in [None, 'model', no_model_str] - # for gradio UI control, expect model_state and model_state0 to match, so if have_model_lock=True, then should have_fresh_model=True - # but gradio API control will only use nochat api etc. 
and won't use fresh model, so can't assert in general - # if have_model_lock: - # assert have_fresh_model, "Expected model_state and model_state0 to match if have_model_lock" - have_cli_model = model_state0['model'] not in [None, 'model', no_model_str] - - if have_fresh_model: - # USE FRESH MODEL - if not have_model_lock: - # model_state0 is just one of model_state if model_lock, so don't nuke - # try to free-up original model (i.e. list was passed as reference) - if model_state0['model'] and hasattr(model_state0['model'], 'cpu'): - model_state0['model'].cpu() - model_state0['model'] = None - # try to free-up original tokenizer (i.e. list was passed as reference) - if model_state0['tokenizer']: - model_state0['tokenizer'] = None - clear_torch_cache() - chosen_model_state = model_state - elif have_cli_model: - # USE MODEL SETUP AT CLI - assert isinstance(model_state['model'], (type(None), str)) # expect no fresh model - chosen_model_state = model_state0 - else: - raise AssertionError(no_model_msg) - # get variables - model = chosen_model_state['model'] - tokenizer = chosen_model_state['tokenizer'] - device = chosen_model_state['device'] - base_model = chosen_model_state['base_model'] - tokenizer_base_model = chosen_model_state['tokenizer_base_model'] - lora_weights = chosen_model_state['lora_weights'] - inference_server = chosen_model_state['inference_server'] - visible_models = chosen_model_state['visible_models'] - # use overall key if have, so key for this gradio and any inner gradio - if chosen_model_state['h2ogpt_key'] is not None: - h2ogpt_key = chosen_model_state['h2ogpt_key'] - # prefer use input from API over model state - prompt_type = prompt_type or chosen_model_state['prompt_type'] - prompt_dict = prompt_dict or chosen_model_state['prompt_dict'] - - if base_model is None: - raise AssertionError(no_model_msg) - - assert base_model.strip(), no_model_msg - assert model, "Model is missing" - assert tokenizer, "Tokenizer is missing" - - # choose chat or non-chat mode - if not chat: - instruction = instruction_nochat - iinput = iinput_nochat - - # in some cases, like lean nochat API, don't want to force sending prompt_type, allow default choice - model_lower = base_model.lower() - if not prompt_type and model_lower in inv_prompt_type_to_model_lower and prompt_type != 'custom': - prompt_type = inv_prompt_type_to_model_lower[model_lower] - if verbose: - print("Auto-selecting prompt_type=%s for %s" % (prompt_type, model_lower), flush=True) - assert prompt_type is not None, "prompt_type was None" - - # Control generation hyperparameters - # adjust for bad inputs, e.g. 
in case also come from API that doesn't get constrained by gradio sliders - # below is for TGI server, not required for HF transformers - # limits are chosen similar to gradio_runner.py sliders/numbers - top_p = min(max(1e-3, top_p), 1.0 - 1e-3) - top_k = min(max(1, int(top_k)), 100) - temperature = min(max(0.01, temperature), 2.0) - # FIXME: https://github.com/h2oai/h2ogpt/issues/106 - num_beams = 1 if stream_output else num_beams # See max_beams in gradio_runner - max_max_new_tokens = get_max_max_new_tokens(chosen_model_state, - memory_restriction_level=memory_restriction_level, - max_new_tokens=max_new_tokens, - max_max_new_tokens=max_max_new_tokens) - if min_max_new_tokens is None: - # default for nochat api - min_max_new_tokens = 256 - if docs_ordering_type is None: - docs_ordering_type = 'reverse_ucurve_sort' - model_max_length = get_model_max_length(chosen_model_state) - max_new_tokens = min(max(1, int(max_new_tokens)), max_max_new_tokens) - min_new_tokens = min(max(0, int(min_new_tokens)), max_new_tokens) - max_time = min(max(0, max_time), max_max_time) - repetition_penalty = min(max(0.01, repetition_penalty), 3.0) - num_return_sequences = 1 if chat else min(max(1, int(num_return_sequences)), 10) - min_top_k_docs, max_top_k_docs, label_top_k_docs = get_minmax_top_k_docs(is_public) - # limit total tokens processed, e.g. for summarization, if public instance - if is_public: - total_tokens_for_docs = min(2 * model_max_length, 16384) - else: - total_tokens_for_docs = None - top_k_docs = min(max(min_top_k_docs, int(top_k_docs)), max_top_k_docs) - chunk_size = min(max(128, int(chunk_size)), 2048) - if not context: - context = '' - - # get prompter - prompter = Prompter(prompt_type, prompt_dict, debug=debug, chat=chat, stream_output=stream_output, - system_prompt=system_prompt) - - # THIRD PLACE where LangChain referenced, but imports only occur if enabled and have db to use - assert langchain_mode in langchain_modes, "Invalid langchain_mode %s not in %s" % (langchain_mode, langchain_modes) - assert langchain_action in langchain_actions, "Invalid langchain_action %s not in %s" % ( - langchain_action, langchain_actions) - assert len( - set(langchain_agents).difference(langchain_agents_list)) == 0, "Invalid langchain_agents %s" % langchain_agents - - # get db, but also fill db state so return already has my_db_state and dbs filled so faster next query - from src.gpt_langchain import get_any_db - db = get_any_db(my_db_state, langchain_mode, langchain_mode_paths, langchain_mode_types, - dbs=dbs, - load_db_if_exists=load_db_if_exists, - db_type=db_type, - use_openai_embedding=use_openai_embedding, - hf_embedding_model=hf_embedding_model, - migrate_embedding_model=migrate_embedding_model, - auto_migrate_db=auto_migrate_db, - for_sources_list=True, - verbose=verbose, - n_jobs=n_jobs, - ) - - t_generate = time.time() - langchain_only_model = base_model in non_hf_types or \ - load_exllama or \ - inference_server.startswith('replicate') or \ - inference_server.startswith('sagemaker') or \ - inference_server.startswith('openai_azure_chat') or \ - inference_server.startswith('openai_azure') - do_langchain_path = langchain_mode not in [False, 'Disabled', 'LLM'] or \ - langchain_only_model or \ - force_langchain_evaluate or \ - len(text_context_list) > 0 - - if len(langchain_agents) > 0: - do_langchain_path = True - if add_search_to_context: - # easier to manage prompt etc. 
by doing full langchain path - do_langchain_path = True - - if do_langchain_path: - text = '' - sources = '' - response = '' - # use smaller cut_distance for wiki_full since so many matches could be obtained, and often irrelevant unless close - from gpt_langchain import run_qa_db - gen_hyper_langchain = dict(do_sample=do_sample, - temperature=temperature, - repetition_penalty=repetition_penalty, - top_k=top_k, - top_p=top_p, - num_beams=num_beams, - min_new_tokens=min_new_tokens, - max_new_tokens=max_new_tokens, - early_stopping=early_stopping, - max_time=max_time, - num_return_sequences=num_return_sequences, - ) - loaders_dict, captions_model = gr_to_lg(image_loaders, - pdf_loaders, - url_loaders, - captions_model=captions_model, - ) - loaders_dict.update(dict(captions_model=captions_model, - caption_loader=caption_loader, - doctr_loader=doctr_loader, - pix2struct_loader=pix2struct_loader, - jq_schema=jq_schema, - )) - data_point = dict(context=context, instruction=instruction, input=iinput) - # no longer stuff chat history directly into context this early - prompt_basic = prompter.generate_prompt(data_point, context_from_history=False) - prompt = prompt_basic - num_prompt_tokens = 0 - for r in run_qa_db( - inference_server=inference_server, - model_name=base_model, model=model, tokenizer=tokenizer, - langchain_only_model=langchain_only_model, - async_output=async_output, - num_async=num_async, - prompter=prompter, - use_llm_if_no_docs=use_llm_if_no_docs, - load_db_if_exists=load_db_if_exists, - db=db, - langchain_mode_paths=langchain_mode_paths, - langchain_mode_types=langchain_mode_types, - detect_user_path_changes_every_query=detect_user_path_changes_every_query, - cut_distance=1.1 if langchain_mode in ['wiki_full'] else cut_distance, - answer_with_sources=answer_with_sources, - append_sources_to_answer=append_sources_to_answer, - add_chat_history_to_context=add_chat_history_to_context, - add_search_to_context=add_search_to_context, - keep_sources_in_context=keep_sources_in_context, - memory_restriction_level=memory_restriction_level, - system_prompt=system_prompt, - use_openai_embedding=use_openai_embedding, - use_openai_model=use_openai_model, - hf_embedding_model=hf_embedding_model, - migrate_embedding_model=migrate_embedding_model, - auto_migrate_db=auto_migrate_db, - first_para=first_para, - text_limit=text_limit, - show_accordions=show_accordions, - top_k_docs_max_show=top_k_docs_max_show, - show_link_in_sources=show_link_in_sources, - - # evaluate args items - query=instruction, - iinput=iinput, - context=context, - stream_output=stream_output, - chunk=chunk, - chunk_size=chunk_size, - - **loaders_dict, - - langchain_mode=langchain_mode, - langchain_action=langchain_action, - langchain_agents=langchain_agents, - document_subset=document_subset, - document_choice=document_choice, - top_k_docs=top_k_docs, - prompt_type=prompt_type, - prompt_dict=prompt_dict, - pre_prompt_query=pre_prompt_query, - prompt_query=prompt_query, - pre_prompt_summary=pre_prompt_summary, - prompt_summary=prompt_summary, - text_context_list=text_context_list, - chat_conversation=chat_conversation, - visible_models=visible_models, - h2ogpt_key=h2ogpt_key, - docs_ordering_type=docs_ordering_type, - min_max_new_tokens=min_max_new_tokens, - - **gen_hyper_langchain, - - db_type=db_type, - n_jobs=n_jobs, - verbose=verbose, - cli=cli, - sanitize_bot_response=sanitize_bot_response, - - lora_weights=lora_weights, - - auto_reduce_chunks=auto_reduce_chunks, - max_chunks=max_chunks, - 
total_tokens_for_docs=total_tokens_for_docs, - headsize=headsize, - ): - # doesn't accumulate, new answer every yield, so only save that full answer - response = r['response'] - sources = r['sources'] - prompt = r['prompt'] - num_prompt_tokens = r['num_prompt_tokens'] - yield dict(response=response, sources=sources, save_dict=dict()) - if save_dir: - # estimate using tiktoken - extra_dict = gen_hyper_langchain.copy() - extra_dict.update(prompt_type=prompt_type, - inference_server=inference_server, - langchain_mode=langchain_mode, - langchain_action=langchain_action, - langchain_agents=langchain_agents, - document_subset=document_subset, - document_choice=document_choice, - chat_conversation=chat_conversation, - add_search_to_context=add_search_to_context, - num_prompt_tokens=num_prompt_tokens, - instruction=instruction, - iinput=iinput, - context=context, - t_generate=time.time() - t_generate, - ntokens=None, - tokens_persecond=None, - ) - save_dict = dict(prompt=prompt, - output=response, base_model=base_model, save_dir=save_dir, - where_from='run_qa_db', - extra_dict=extra_dict) - yield dict(response=response, sources=sources, save_dict=save_dict) - if verbose: - print( - 'Post-Generate Langchain: %s decoded_output: %s' % - (str(datetime.now()), len(response) if response else -1), - flush=True) - if response or sources or langchain_only_model: - # if got no response (e.g. not showing sources and got no sources, - # so nothing to give to LLM), then slip through and ask LLM - # Or if llama/gptj, then just return since they had no response and can't go down below code path - # don't clear torch cache here, delays multi-generation, and bot(), all_bot(), and evaluate_nochat() do it - return - - # NOT LANGCHAIN PATH, raw LLM - # restrict instruction + , typically what has large input - prompt, \ - instruction, iinput, context, \ - num_prompt_tokens, max_new_tokens, num_prompt_tokens0, num_prompt_tokens_actual, \ - chat_index, top_k_docs_trial, one_doc_size = \ - get_limited_prompt(instruction, - iinput, - tokenizer, - prompter=prompter, - inference_server=inference_server, - # prompt_type=prompt_type, - # prompt_dict=prompt_dict, - # chat=chat, - max_new_tokens=max_new_tokens, - # system_prompt=system_prompt, - context=context, - chat_conversation=chat_conversation, - keep_sources_in_context=keep_sources_in_context, - model_max_length=model_max_length, - memory_restriction_level=memory_restriction_level, - langchain_mode=langchain_mode, - add_chat_history_to_context=add_chat_history_to_context, - min_max_new_tokens=min_max_new_tokens, - ) - - if inference_server.startswith('vllm') or \ - inference_server.startswith('openai') or \ - inference_server.startswith('http'): - if inference_server.startswith('vllm') or inference_server.startswith('openai'): - assert not inference_server.startswith('openai_azure_chat'), "Not fo Azure, use langchain path" - assert not inference_server.startswith('openai_azure'), "Not for Azure, use langchain path" - openai, inf_type, deployment_name, base_url, api_version = set_openai(inference_server) - where_from = inf_type - - terminate_response = prompter.terminate_response or [] - stop_sequences = list(set(terminate_response + [prompter.PreResponse])) - stop_sequences = [x for x in stop_sequences if x] - # OpenAI will complain if ask for too many new tokens, takes it as min in some sense, wrongly so. 
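# Worked example with illustrative numbers: for model_max_length=4096 and
# num_prompt_tokens=3000 there is only 4096 - 3000 = 1096 tokens of room left,
# so a requested max_new_tokens of 2048 is clipped to 1096 (while 1024 would pass
# through unchanged) before being sent as OpenAI's max_tokens below.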
- max_new_tokens_openai = min(max_new_tokens, model_max_length - num_prompt_tokens) - gen_server_kwargs = dict(temperature=temperature if do_sample else 0, - max_tokens=max_new_tokens_openai, - top_p=top_p if do_sample else 1, - frequency_penalty=0, - n=num_return_sequences, - presence_penalty=1.07 - repetition_penalty + 0.6, # so good default - ) - if inf_type == 'vllm' or inference_server == 'openai': - responses = openai.Completion.create( - model=base_model, - prompt=prompt, - **gen_server_kwargs, - stop=stop_sequences, - stream=stream_output, - ) - text = '' - sources = '' - response = '' - if not stream_output: - text = responses['choices'][0]['text'] - response = prompter.get_response(prompt + text, prompt=prompt, - sanitize_bot_response=sanitize_bot_response) - yield dict(response=response, sources=sources, save_dict=dict()) - else: - collected_events = [] - for event in responses: - collected_events.append(event) # save the event response - event_text = event['choices'][0]['text'] # extract the text - text += event_text # append the text - response = prompter.get_response(prompt + text, prompt=prompt, - sanitize_bot_response=sanitize_bot_response) - yield dict(response=response, sources=sources, save_dict=dict()) - elif inf_type == 'vllm_chat' or inference_server == 'openai_chat': - if inf_type == 'vllm_chat': - raise NotImplementedError('%s not supported by vLLM' % inf_type) - if system_prompt in [None, 'None', 'auto']: - openai_system_prompt = "You are a helpful assistant." - else: - openai_system_prompt = system_prompt - messages0 = [] - if openai_system_prompt: - messages0.append({"role": "system", "content": openai_system_prompt}) - messages0.append({'role': 'user', 'content': prompt}) - responses = openai.ChatCompletion.create( - model=base_model, - messages=messages0, - stream=stream_output, - **gen_server_kwargs, - ) - text = "" - sources = '' - response = "" - if not stream_output: - text = responses["choices"][0]["message"]["content"] - response = prompter.get_response(prompt + text, prompt=prompt, - sanitize_bot_response=sanitize_bot_response) - yield dict(response=response, sources=sources, save_dict=dict()) - else: - for chunk in responses: - delta = chunk["choices"][0]["delta"] - if 'content' in delta: - text += delta['content'] - response = prompter.get_response(prompt + text, prompt=prompt, - sanitize_bot_response=sanitize_bot_response) - yield dict(response=response, sources=sources, save_dict=dict()) - else: - raise RuntimeError("No such OpenAI mode: %s" % inference_server) - elif inference_server.startswith('http'): - inference_server, headers = get_hf_server(inference_server) - from gradio_utils.grclient import GradioClient - from text_generation import Client as HFClient - if isinstance(model, GradioClient): - gr_client = model - hf_client = None - elif isinstance(model, HFClient): - gr_client = None - hf_client = model - else: - inference_server, gr_client, hf_client = get_client_from_inference_server(inference_server, - base_model=base_model) - - # quick sanity check to avoid long timeouts, just see if can reach server - requests.get(inference_server, timeout=int(os.getenv('REQUEST_TIMEOUT_FAST', '10'))) - - if gr_client is not None: - # Note: h2oGPT gradio server could handle input token size issues for prompt, - # but best to handle here so send less data to server - - chat_client = False - where_from = "gr_client" - client_langchain_mode = 'Disabled' - client_add_chat_history_to_context = True - client_add_search_to_context = False - 
client_langchain_action = LangChainAction.QUERY.value - client_langchain_agents = [] - gen_server_kwargs = dict(temperature=temperature, - top_p=top_p, - top_k=top_k, - num_beams=num_beams, - max_new_tokens=max_new_tokens, - min_new_tokens=min_new_tokens, - early_stopping=early_stopping, - max_time=max_time, - repetition_penalty=repetition_penalty, - num_return_sequences=num_return_sequences, - do_sample=do_sample, - chat=chat_client, - ) - # account for gradio into gradio that handles prompting, avoid duplicating prompter prompt injection - if prompt_type in [None, '', PromptType.plain.name, PromptType.plain.value, - str(PromptType.plain.value)]: - # if our prompt is plain, assume either correct or gradio server knows different prompt type, - # so pass empty prompt_Type - gr_prompt_type = '' - gr_prompt_dict = '' - gr_prompt = prompt # already prepared prompt - gr_context = '' - gr_iinput = '' - else: - # if already have prompt_type that is not plain, None, or '', then already applied some prompting - # But assume server can handle prompting, and need to avoid double-up. - # Also assume server can do better job of using stopping.py to stop early, so avoid local prompting, let server handle - # So avoid "prompt" and let gradio server reconstruct from prompt_type we passed - # Note it's ok that prompter.get_response() has prompt+text, prompt=prompt passed, - # because just means extra processing and removal of prompt, but that has no human-bot prompting doesn't matter - # since those won't appear - gr_context = context - gr_prompt = instruction - gr_iinput = iinput - gr_prompt_type = prompt_type - gr_prompt_dict = prompt_dict - client_kwargs = dict(instruction=gr_prompt if chat_client else '', # only for chat=True - iinput=gr_iinput, # only for chat=True - context=gr_context, - # streaming output is supported, loops over and outputs each generation in streaming mode - # but leave stream_output=False for simple input/output mode - stream_output=stream_output, - - **gen_server_kwargs, - - prompt_type=gr_prompt_type, - prompt_dict=gr_prompt_dict, - - instruction_nochat=gr_prompt if not chat_client else '', - iinput_nochat=gr_iinput, # only for chat=False - langchain_mode=client_langchain_mode, - add_chat_history_to_context=client_add_chat_history_to_context, - langchain_action=client_langchain_action, - langchain_agents=client_langchain_agents, - top_k_docs=top_k_docs, - chunk=chunk, - chunk_size=chunk_size, - document_subset=DocumentSubset.Relevant.name, - document_choice=[DocumentChoice.ALL.value], - pre_prompt_query=pre_prompt_query, - prompt_query=prompt_query, - pre_prompt_summary=pre_prompt_summary, - prompt_summary=prompt_summary, - system_prompt=system_prompt, - image_loaders=image_loaders, - pdf_loaders=pdf_loaders, - url_loaders=url_loaders, - jq_schema=jq_schema, - visible_models=visible_models, - h2ogpt_key=h2ogpt_key, - add_search_to_context=client_add_search_to_context, - docs_ordering_type=None, - min_max_new_tokens=min_max_new_tokens, - ) - api_name = '/submit_nochat_api' # NOTE: like submit_nochat but stable API for string dict passing - response = '' - text = '' - sources = '' - if not stream_output: - res = gr_client.predict(str(dict(client_kwargs)), api_name=api_name) - res_dict = ast.literal_eval(res) - text = res_dict['response'] - sources = res_dict['sources'] - response = prompter.get_response(prompt + text, prompt=prompt, - sanitize_bot_response=sanitize_bot_response) - yield dict(response=response, sources=sources, save_dict=dict()) - else: - job = 
gr_client.submit(str(dict(client_kwargs)), api_name=api_name) - res_dict = dict(response=text, sources=sources, save_dict=dict()) - text0 = '' - while not job.done(): - if job.communicator.job.latest_status.code.name == 'FINISHED': - break - e = job.future._exception - if e is not None: - break - outputs_list = job.communicator.job.outputs - if outputs_list: - res = job.communicator.job.outputs[-1] - res_dict = ast.literal_eval(res) - text = res_dict['response'] - sources = res_dict['sources'] - if gr_prompt_type == 'plain': - # then gradio server passes back full prompt + text - prompt_and_text = text - else: - prompt_and_text = prompt + text - response = prompter.get_response(prompt_and_text, prompt=prompt, - sanitize_bot_response=sanitize_bot_response) - text_chunk = response[len(text0):] - if not text_chunk: - continue - # save old - text0 = response - yield dict(response=response, sources=sources, save_dict=dict()) - time.sleep(0.01) - # ensure get last output to avoid race - res_all = job.outputs() - if len(res_all) > 0: - res = res_all[-1] - res_dict = ast.literal_eval(res) - text = res_dict['response'] - sources = res_dict['sources'] - else: - # go with old text if last call didn't work - e = job.future._exception - if e is not None: - stre = str(e) - strex = ''.join(traceback.format_tb(e.__traceback__)) - else: - stre = '' - strex = '' - - print("Bad final response: %s %s %s %s %s: %s %s" % (base_model, inference_server, - res_all, prompt, text, stre, strex), - flush=True) - if gr_prompt_type == 'plain': - # then gradio server passes back full prompt + text - prompt_and_text = text - else: - prompt_and_text = prompt + text - response = prompter.get_response(prompt_and_text, prompt=prompt, - sanitize_bot_response=sanitize_bot_response) - yield dict(response=response, sources=sources, save_dict=dict()) - elif hf_client: - # HF inference server needs control over input tokens - where_from = "hf_client" - response = '' - extra = '' - sources = '' - - # prompt must include all human-bot like tokens, already added by prompt - # https://github.com/huggingface/text-generation-inference/tree/main/clients/python#types - terminate_response = prompter.terminate_response or [] - stop_sequences = list(set(terminate_response + [prompter.PreResponse])) - stop_sequences = [x for x in stop_sequences if x] - gen_server_kwargs = dict(do_sample=do_sample, - max_new_tokens=max_new_tokens, - # best_of=None, - repetition_penalty=repetition_penalty, - return_full_text=False, - seed=SEED, - stop_sequences=stop_sequences, - temperature=temperature, - top_k=top_k, - top_p=top_p, - # truncate=False, # behaves oddly - # typical_p=top_p, - # watermark=False, - # decoder_input_details=False, - ) - # work-around for timeout at constructor time, will be issue if multi-threading, - # so just do something reasonable or max_time if larger - # lower bound because client is re-used if multi-threading - hf_client.timeout = max(300, max_time) - if not stream_output: - text = hf_client.generate(prompt, **gen_server_kwargs).generated_text - response = prompter.get_response(prompt + text, prompt=prompt, - sanitize_bot_response=sanitize_bot_response) - yield dict(response=response, sources=sources, save_dict=dict()) - else: - text = "" - for responses in hf_client.generate_stream(prompt, **gen_server_kwargs): - if not responses.token.special: - # stop_sequences - text_chunk = responses.token.text - text += text_chunk - response = prompter.get_response(prompt + text, prompt=prompt, - 
sanitize_bot_response=sanitize_bot_response) - sources = '' - yield dict(response=response, sources=sources, save_dict=dict()) - else: - raise RuntimeError("Failed to get client: %s" % inference_server) - else: - raise RuntimeError("No such inference_server %s" % inference_server) - - if save_dir and text: - # save prompt + new text - extra_dict = gen_server_kwargs.copy() - extra_dict.update(dict(inference_server=inference_server, num_prompt_tokens=num_prompt_tokens, - t_generate=time.time() - t_generate, - ntokens=None, - tokens_persecond=None, - )) - save_dict = dict(prompt=prompt, output=text, base_model=base_model, save_dir=save_dir, - where_from=where_from, extra_dict=extra_dict) - yield dict(response=response, sources=sources, save_dict=save_dict) - return - else: - assert not inference_server, "inference_server=%s not supported" % inference_server - - if isinstance(tokenizer, str): - # pipeline - if tokenizer == "summarization": - key = 'summary_text' - else: - raise RuntimeError("No such task type %s" % tokenizer) - # NOTE: uses max_length only - sources = '' - yield dict(response=model(prompt, max_length=max_new_tokens)[0][key], sources=sources, save_dict=dict()) - - if 'mbart-' in base_model.lower(): - assert src_lang is not None - tokenizer.src_lang = languages_covered()[src_lang] - - stopping_criteria = get_stopping(prompt_type, prompt_dict, tokenizer, device, base_model, - model_max_length=model_max_length, - prompter=prompter) - - inputs = tokenizer(prompt, return_tensors="pt") - if debug and len(inputs["input_ids"]) > 0: - print('input_ids length', len(inputs["input_ids"][0]), flush=True) - input_ids = inputs["input_ids"].to(device) - # CRITICAL LIMIT else will fail - max_max_tokens = tokenizer.model_max_length - max_input_tokens = max(0, int(max_max_tokens - min_new_tokens)) - # NOTE: Don't limit up front due to max_new_tokens, let go up to max or reach max_max_tokens in stopping.py - assert isinstance(max_input_tokens, int), "Bad type for max_input_tokens=%s %s" % ( - max_input_tokens, type(max_input_tokens)) - input_ids = input_ids[:, -max_input_tokens:] - # required for falcon if multiple threads or asyncio accesses to model during generation - if use_cache is None: - use_cache = False if 'falcon' in base_model else True - gen_config_kwargs = dict(num_beams=num_beams, - do_sample=do_sample, - repetition_penalty=float(repetition_penalty), - num_return_sequences=num_return_sequences, - renormalize_logits=True, - remove_invalid_values=True, - use_cache=use_cache, - ) - if do_sample: - gen_config_kwargs.update(dict(temperature=float(temperature), - top_p=float(top_p), - top_k=top_k)) - if True: - # unclear impact, some odd things going on inside - # leads to: - # The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. - # Setting `pad_token_id` to `eos_token_id`:2 for open-end generation. - # or leads to: - # Using cls_token, but it is not set yet. - # Using mask_token, but it is not set yet. - # Using pad_token, but it is not set yet. - # Using sep_token, but it is not set yet. 
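For reference, a minimal sketch (not part of gen.py) of the left-truncation applied to input_ids above: the oldest tokens are dropped so that at least min_new_tokens of the model context stays free for new output. The helper name and the model used in the usage note are illustrative assumptions, not code from this file.

    from transformers import AutoTokenizer

    def truncate_input_ids(prompt, tokenizer, min_new_tokens=256):
        # tokenize, then keep only the most recent tokens, mirroring
        # input_ids = input_ids[:, -max_input_tokens:] in evaluate() above
        input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
        max_input_tokens = max(0, int(tokenizer.model_max_length - min_new_tokens))
        return input_ids[:, -max_input_tokens:]

    # hypothetical usage:
    # truncate_input_ids("a very long prompt ...", AutoTokenizer.from_pretrained("distilgpt2"))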
- token_ids = ['eos_token_id', 'pad_token_id', 'bos_token_id', 'cls_token_id', 'sep_token_id'] - for token_id in token_ids: - if hasattr(tokenizer, token_id) and getattr(tokenizer, token_id) is not None: - gen_config_kwargs.update({token_id: getattr(tokenizer, token_id)}) - generation_config = GenerationConfig(**gen_config_kwargs) - - gen_kwargs = dict(input_ids=input_ids, - generation_config=generation_config, - return_dict_in_generate=True, - output_scores=True, - max_new_tokens=max_new_tokens, # prompt + new - min_new_tokens=min_new_tokens, # prompt + new - early_stopping=early_stopping, # False, True, "never" - max_time=max_time, - stopping_criteria=stopping_criteria, - ) - if 'gpt2' in base_model.lower(): - gen_kwargs.update(dict(bos_token_id=tokenizer.bos_token_id, pad_token_id=tokenizer.eos_token_id)) - elif 'mbart-' in base_model.lower(): - assert tgt_lang is not None - tgt_lang = languages_covered()[tgt_lang] - gen_kwargs.update(dict(forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang])) - else: - token_ids = ['eos_token_id', 'bos_token_id', 'pad_token_id'] - for token_id in token_ids: - if hasattr(tokenizer, token_id) and getattr(tokenizer, token_id) is not None: - gen_kwargs.update({token_id: getattr(tokenizer, token_id)}) - - decoder_kwargs = dict(skip_special_tokens=True, - clean_up_tokenization_spaces=True) - - decoder = functools.partial(tokenizer.decode, - **decoder_kwargs - ) - with torch.no_grad(): - have_lora_weights = lora_weights not in [no_lora_str, '', None] - context_class_cast = NullContext if device == 'cpu' or have_lora_weights or device == 'mps' else torch.autocast - if t5_type(base_model): - # issues when casting to float16, can mess up t5 model, e.g. only when not streaming, or other odd behaviors - context_class_cast = NullContext - with context_class_cast(device): - # protection for gradio not keeping track of closed users, - # else hit bitsandbytes lack of thread safety: - # https://github.com/h2oai/h2ogpt/issues/104 - # but only makes sense if concurrency_count == 1 - context_class = NullContext # if concurrency_count > 1 else filelock.FileLock - if verbose: - print('Pre-Generate: %s' % str(datetime.now()), flush=True) - decoded_output = None - response = '' - with context_class("generate.lock"): - if verbose: - print('Generate: %s' % str(datetime.now()), flush=True) - always_use_streaming_method = True # to deal with complex parsing of prompt vs. 
generation due to odd tokenizing - if stream_output or always_use_streaming_method: - skip_prompt = True # True means first output excludes prompt - streamer = H2OTextIteratorStreamer(tokenizer, skip_prompt=skip_prompt, block=False, - **decoder_kwargs) - gen_kwargs.update(dict(streamer=streamer)) - target = wrapped_partial(generate_with_exceptions, model.generate, - raise_generate_gpu_exceptions=raise_generate_gpu_exceptions, - **gen_kwargs) - bucket = queue.Queue() - thread = EThread(target=target, streamer=streamer, bucket=bucket) - thread.start() - ret = dict(response='', sources='', save_dict=dict()) - outputs = "" - sources = '' - try: - for new_text in streamer: - if bucket.qsize() > 0 or thread.exc: - thread.join() - outputs += new_text - response = prompter.get_response(outputs, prompt=None, - only_new_text=True, - sanitize_bot_response=sanitize_bot_response) - ret = dict(response=response, sources=sources, save_dict=dict()) - if stream_output: - yield ret - if not stream_output: - yield ret - except BaseException: - # if any exception, raise that exception if was from thread, first - if thread.exc: - raise thread.exc - raise - finally: - # don't clear torch cache here, delays multi-generation, and bot(), all_bot(), and evaluate_nochat() do it - # in case no exception and didn't join with thread yet, then join - if not thread.exc: - thread.join() - # in case raise StopIteration or broke queue loop in streamer, but still have exception - if thread.exc: - raise thread.exc - decoded_output = outputs - ntokens = len(outputs) // 4 # hack for now - else: - # below length removal doesn't work in general, because encoding does not match internal of model generation - input_ids_len = gen_kwargs['input_ids'][0].shape[0] - try: - outputs = model.generate(**gen_kwargs) - finally: - pass - # don't clear torch cache here, delays multi-generation, and bot(), all_bot(), and evaluate_nochat() do it - # skip first IDs - ntokens = sum([len(s) - input_ids_len for s in outputs.sequences]) if save_dir else -1 - outputs = [decoder(s[input_ids_len:]) for s in outputs.sequences] - sources = '' - response = prompter.get_response(outputs, prompt=None, - only_new_text=True, - sanitize_bot_response=sanitize_bot_response) - yield dict(response=response, sources=sources, save_dict=dict()) - if outputs and len(outputs) >= 1: - decoded_output = prompt + outputs[0] - if save_dir and decoded_output: - extra_dict = gen_config_kwargs.copy() - extra_dict.update(dict(num_prompt_tokens=num_prompt_tokens, - t_generate=time.time() - t_generate, - ntokens=ntokens, - tokens_persecond=ntokens / (time.time() - t_generate), - )) - save_dict = dict(prompt=prompt, output=decoded_output, base_model=base_model, save_dir=save_dir, - where_from="evaluate_%s" % str(stream_output), - extra_dict=extra_dict) - yield dict(response=response, sources=sources, save_dict=save_dict) - if verbose: - print('Post-Generate: %s decoded_output: %s' % ( - str(datetime.now()), len(decoded_output) if decoded_output else -1), flush=True) - - -inputs_list_names = list(inspect.signature(evaluate).parameters) -state_names = input_args_list.copy() # doesn't have to be the same, but state_names must match evaluate() and how filled then -inputs_kwargs_list = [x for x in inputs_list_names if x not in eval_func_param_names + state_names] - - -def get_cutoffs(memory_restriction_level, for_context=False, model_max_length=2048): - # help to avoid errors like: - # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton 
dimension 3 - # RuntimeError: expected scalar type Half but found Float - # with - 256 - if memory_restriction_level > 0: - max_length_tokenize = 768 - 256 if memory_restriction_level <= 2 else 512 - 256 - else: - # at least give room for 1 paragraph output - max_length_tokenize = model_max_length - 256 - cutoff_len = max_length_tokenize * 4 # if reaches limit, then can't generate new tokens - output_smallest = 30 * 4 - max_prompt_length = cutoff_len - output_smallest - - if for_context: - # then lower even more to avoid later chop, since just estimate tokens in context bot - max_prompt_length = max(64, int(max_prompt_length * 0.8)) - - return cutoff_len, output_smallest, max_length_tokenize, max_prompt_length - - -class H2OTextIteratorStreamer(TextIteratorStreamer): - """ - normally, timeout required for now to handle exceptions, else get() - but with H2O version of TextIteratorStreamer, loop over block to handle - """ - - def __init__(self, tokenizer, skip_prompt: bool = False, timeout: typing.Optional[float] = None, - block=True, **decode_kwargs): - super().__init__(tokenizer, skip_prompt, **decode_kwargs) - self.text_queue = queue.Queue() - self.stop_signal = None - self.do_stop = False - self.timeout = timeout - self.block = block - - def on_finalized_text(self, text: str, stream_end: bool = False): - """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue.""" - self.text_queue.put(text, timeout=self.timeout) - if stream_end: - self.text_queue.put(self.stop_signal, timeout=self.timeout) - - def __iter__(self): - return self - - def __next__(self): - while True: - try: - value = self.stop_signal # value looks unused in pycharm, not true - if self.do_stop: - print("hit stop", flush=True) - # could raise or break, maybe best to raise and make parent see if any exception in thread - self.clear_queue() - self.do_stop = False - raise StopIteration() - # break - value = self.text_queue.get(block=self.block, timeout=self.timeout) - break - except queue.Empty: - time.sleep(0.01) - if value == self.stop_signal: - self.clear_queue() - self.do_stop = False - raise StopIteration() - else: - return value - - def clear_queue(self): - # make sure streamer is reusable after stop hit - with self.text_queue.mutex: - self.text_queue.queue.clear() - - def put(self, value): - """ - Receives tokens, decodes them, and prints them to stdout as soon as they form entire words. - # same as base class, except remove hack w.r.t. text.rfind(" ") that ruins LLaMa2 - """ - if len(value.shape) > 1 and value.shape[0] > 1: - raise ValueError("TextStreamer only supports batch size 1") - elif len(value.shape) > 1: - value = value[0] - - if self.skip_prompt and self.next_tokens_are_prompt: - self.next_tokens_are_prompt = False - return - - # Add the new token to the cache and decodes the entire thing. - self.token_cache.extend(value.tolist()) - text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs) - - # After the symbol for a new line, we flush the cache. - if text.endswith("\n"): - printable_text = text[self.print_len:] - self.token_cache = [] - self.print_len = 0 - # If the last token is a CJK character, we print the characters. - elif len(text) > 0 and self._is_chinese_char(ord(text[-1])): - printable_text = text[self.print_len:] - self.print_len += len(printable_text) - # Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words, - # which may change with the subsequent token -- there are probably smarter ways to do this!) 
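For context, a minimal sketch (not from gen.py) of the stock transformers TextIteratorStreamer pattern that H2OTextIteratorStreamer above builds on, adding a non-blocking get(), an explicit stop flag, and removal of the rfind(" ") heuristic. The model name and generation settings are illustrative only.

    from threading import Thread
    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    model = AutoModelForCausalLM.from_pretrained("distilgpt2")
    inputs = tokenizer("The sky is", return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # generation runs in a worker thread; the streamer yields decoded text as it arrives
    thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=32))
    thread.start()
    for text_chunk in streamer:
        print(text_chunk, end="", flush=True)
    thread.join()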
- elif len(text) > 0 and text[-1] == '�': - printable_text = text[self.print_len: text.rfind(" ") + 1] - self.print_len += len(printable_text) - else: - printable_text = text[self.print_len:] - self.print_len += len(printable_text) - - self.on_finalized_text(printable_text) - - -def generate_with_exceptions(func, *args, raise_generate_gpu_exceptions=True, **kwargs): - try: - func(*args, **kwargs) - except torch.cuda.OutOfMemoryError as e: - print("GPU OOM 2: exception: %s" % str(e), - flush=True) - if 'input_ids' in kwargs: - if kwargs['input_ids'] is not None: - kwargs['input_ids'].cpu() - kwargs['input_ids'] = None - traceback.print_exc() - clear_torch_cache() - return - except (Exception, RuntimeError) as e: - if 'Expected all tensors to be on the same device' in str(e) or \ - 'expected scalar type Half but found Float' in str(e) or \ - 'probability tensor contains either' in str(e) or \ - 'cublasLt ran into an error!' in str(e) or \ - 'mat1 and mat2 shapes cannot be multiplied' in str(e): - print( - "GPU Error: exception: %s" % str(e), - flush=True) - traceback.print_exc() - clear_torch_cache() - if raise_generate_gpu_exceptions: - raise - return - else: - clear_torch_cache() - if raise_generate_gpu_exceptions: - raise - - -def get_generate_params(model_lower, - chat, - stream_output, show_examples, - prompt_type, prompt_dict, - system_prompt, - pre_prompt_query, prompt_query, - pre_prompt_summary, prompt_summary, - temperature, top_p, top_k, num_beams, - max_new_tokens, min_new_tokens, early_stopping, max_time, - repetition_penalty, num_return_sequences, - do_sample, - top_k_docs, chunk, chunk_size, - image_loaders, - pdf_loaders, - url_loaders, - jq_schema, - docs_ordering_type, - min_max_new_tokens, - verbose, - ): - use_defaults = False - use_default_examples = True - examples = [] - task_info = 'LLM' - if model_lower: - print(f"Using Model {model_lower}", flush=True) - else: - if verbose: - print("No model defined yet", flush=True) - - min_new_tokens = min_new_tokens if min_new_tokens is not None else 0 - early_stopping = early_stopping if early_stopping is not None else False - max_time_defaults = 60 * 3 - max_time = max_time if max_time is not None else max_time_defaults - - if not prompt_type and model_lower in inv_prompt_type_to_model_lower and prompt_type != 'custom': - prompt_type = inv_prompt_type_to_model_lower[model_lower] - if verbose: - print("Auto-selecting prompt_type=%s for %s" % (prompt_type, model_lower), flush=True) - - # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end - if show_examples is None: - if chat: - show_examples = False - else: - show_examples = True - - summarize_example1 = """Jeff: Can I train a ? Transformers model on Amazon SageMaker? -Philipp: Sure you can use the new Hugging Face Deep Learning Container. -Jeff: ok. -Jeff: and how can I get started? -Jeff: where can I find documentation? -Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face""" - - use_placeholder_instruction_as_example = False - if 'bart-large-cnn-samsum' in model_lower or 'flan-t5-base-samsum' in model_lower: - placeholder_instruction = summarize_example1 - placeholder_input = "" - use_defaults = True - use_default_examples = False - use_placeholder_instruction_as_example = True - task_info = "Summarization" - elif 't5-' in model_lower or 't5' == model_lower or 'flan-' in model_lower: - placeholder_instruction = "The square root of x is the cube root of y. 
What is y to the power of 2, if x = 4?" - placeholder_input = "" - use_defaults = True - use_default_examples = True - task_info = "Multi-Task: Q/A, translation, Chain-of-Thought, Logical Reasoning, Summarization, etc. Best to use task prefix as trained on, e.g. `translate English to German: ` (space after colon)" - elif 'mbart-' in model_lower: - placeholder_instruction = "The girl has long hair." - placeholder_input = "" - use_defaults = True - use_default_examples = False - use_placeholder_instruction_as_example = True - elif 'gpt2' in model_lower: - placeholder_instruction = "The sky is" - placeholder_input = "" - prompt_type = prompt_type or 'plain' - use_default_examples = True # some will be odd "continuations" but can be ok - use_placeholder_instruction_as_example = True - task_info = "Auto-complete phrase, code, etc." - use_defaults = True - else: - if chat: - placeholder_instruction = "" - else: - placeholder_instruction = "Give detailed answer for whether Einstein or Newton is smarter." - placeholder_input = "" - if not prompt_type and model_lower in inv_prompt_type_to_model_lower and prompt_type != 'custom': - prompt_type = inv_prompt_type_to_model_lower[model_lower] - elif model_lower: - # default is plain, because might rely upon trust_remote_code to handle prompting - prompt_type = prompt_type or 'plain' - else: - prompt_type = '' - task_info = "No task" - if prompt_type == 'instruct': - task_info = "Answer question or follow imperative as instruction with optionally input." - elif prompt_type == 'plain': - task_info = "Auto-complete phrase, code, etc." - elif prompt_type == 'human_bot': - if chat: - task_info = "Chat (Shift-Enter to give question/imperative, input concatenated with instruction)" - else: - task_info = "Ask question/imperative (input concatenated with instruction)" - - # revert to plain if still nothing - prompt_type = prompt_type or 'plain' - if use_defaults: - temperature = 1.0 if temperature is None else temperature - top_p = 1.0 if top_p is None else top_p - top_k = 40 if top_k is None else top_k - num_beams = num_beams or 1 - max_new_tokens = max_new_tokens or 512 - repetition_penalty = repetition_penalty or 1.07 - num_return_sequences = min(num_beams, num_return_sequences or 1) - do_sample = False if do_sample is None else do_sample - else: - temperature = 0.1 if temperature is None else temperature - top_p = 0.75 if top_p is None else top_p - top_k = 40 if top_k is None else top_k - num_beams = num_beams or 1 - max_new_tokens = max_new_tokens or 1024 - repetition_penalty = repetition_penalty or 1.07 - num_return_sequences = min(num_beams, num_return_sequences or 1) - do_sample = False if do_sample is None else do_sample - # doesn't include chat, instruction_nochat, iinput_nochat, added later - params_list = ["", - stream_output, - prompt_type, prompt_dict, - temperature, top_p, top_k, num_beams, - max_new_tokens, min_new_tokens, - early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample] - - if use_placeholder_instruction_as_example: - examples += [[placeholder_instruction, ''] + params_list] - - if use_default_examples: - examples += [ - ["Translate English to French", "Good morning"] + params_list, - ["Give detailed answer for whether Einstein or Newton is smarter.", ''] + params_list, - ["Explain in detailed list, all the best practices for coding in python.", ''] + params_list, - [ - "Create a markdown table with 3 rows for the primary colors, and 2 columns, with color name and hex codes.", - ''] + params_list, - ['Translate to 
German: My name is Arthur', ''] + params_list, - ["Please answer to the following question. Who is going to be the next Ballon d'or?", ''] + params_list, - ['Can Geoffrey Hinton have a conversation with George Washington? Give the rationale before answering.', - ''] + params_list, - ['Please answer the following question. What is the boiling point of Nitrogen?', ''] + params_list, - ['Answer the following yes/no question. Can you write a whole Haiku in a single tweet?', ''] + params_list, - ["Simplify the following expression: (False or False and True). Explain your answer.", ''] + params_list, - [ - "Premise: At my age you will probably have learnt one lesson. Hypothesis: It's not certain how many lessons you'll learn by your thirties. Does the premise entail the hypothesis?", - ''] + params_list, - ['The square root of x is the cube root of y. What is y to the power of 2, if x = 4?', ''] + params_list, - [ - 'Answer the following question by reasoning step by step. The cafeteria had 23 apples. If they used 20 for lunch, and bought 6 more, how many apple do they have?', - ''] + params_list, - ["""def area_of_rectangle(a: float, b: float): - \"\"\"Return the area of the rectangle.\"\"\"""", ''] + params_list, - ["""# a function in native python: -def mean(a): - return sum(a)/len(a) - -# the same function using numpy: -import numpy as np -def mean(a):""", ''] + params_list, - ["""X = np.random.randn(100, 100) -y = np.random.randint(0, 1, 100) - -# fit random forest classifier with 20 estimators""", ''] + params_list, - ] - # add summary example - examples += [ - [summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else ''] + params_list] - - src_lang = "English" - tgt_lang = "Russian" - - # move to correct position - for example in examples: - example += [chat, '', '', LangChainMode.DISABLED.value, True, - LangChainAction.QUERY.value, [], - top_k_docs, chunk, chunk_size, DocumentSubset.Relevant.name, [], - pre_prompt_query, prompt_query, - pre_prompt_summary, prompt_summary, - system_prompt, - image_loaders, - pdf_loaders, - url_loaders, - jq_schema, - None, - None, - False, - None, - None, - docs_ordering_type, - min_max_new_tokens, - ] - # adjust examples if non-chat mode - if not chat: - example[eval_func_param_names.index('instruction_nochat')] = example[ - eval_func_param_names.index('instruction')] - example[eval_func_param_names.index('instruction')] = '' - - example[eval_func_param_names.index('iinput_nochat')] = example[eval_func_param_names.index('iinput')] - example[eval_func_param_names.index('iinput')] = '' - assert len(example) == len(eval_func_param_names), "Wrong example: %s %s" % ( - len(example), len(eval_func_param_names)) - - if prompt_type == PromptType.custom.name and not prompt_dict: - raise ValueError("Unexpected to get non-empty prompt_dict=%s for prompt_type=%s" % (prompt_dict, prompt_type)) - - # get prompt_dict from prompt_type, so user can see in UI etc., or for custom do nothing except check format - prompt_dict, error0 = get_prompt(prompt_type, prompt_dict, - chat=False, context='', reduced=False, making_context=False, return_dict=True, - system_prompt=system_prompt) - if error0: - raise RuntimeError("Prompt wrong: %s" % error0) - - return placeholder_instruction, placeholder_input, \ - stream_output, show_examples, \ - prompt_type, prompt_dict, \ - temperature, top_p, top_k, num_beams, \ - max_new_tokens, min_new_tokens, early_stopping, max_time, \ - repetition_penalty, num_return_sequences, \ - do_sample, \ - src_lang, 
tgt_lang, \ - examples, \ - task_info - - -def languages_covered(): - # https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt#languages-covered - covered = """Arabic (ar_AR), Czech (cs_CZ), German (de_DE), English (en_XX), Spanish (es_XX), Estonian (et_EE), Finnish (fi_FI), French (fr_XX), Gujarati (gu_IN), Hindi (hi_IN), Italian (it_IT), Japanese (ja_XX), Kazakh (kk_KZ), Korean (ko_KR), Lithuanian (lt_LT), Latvian (lv_LV), Burmese (my_MM), Nepali (ne_NP), Dutch (nl_XX), Romanian (ro_RO), Russian (ru_RU), Sinhala (si_LK), Turkish (tr_TR), Vietnamese (vi_VN), Chinese (zh_CN), Afrikaans (af_ZA), Azerbaijani (az_AZ), Bengali (bn_IN), Persian (fa_IR), Hebrew (he_IL), Croatian (hr_HR), Indonesian (id_ID), Georgian (ka_GE), Khmer (km_KH), Macedonian (mk_MK), Malayalam (ml_IN), Mongolian (mn_MN), Marathi (mr_IN), Polish (pl_PL), Pashto (ps_AF), Portuguese (pt_XX), Swedish (sv_SE), Swahili (sw_KE), Tamil (ta_IN), Telugu (te_IN), Thai (th_TH), Tagalog (tl_XX), Ukrainian (uk_UA), Urdu (ur_PK), Xhosa (xh_ZA), Galician (gl_ES), Slovene (sl_SI)""" - covered = covered.split(', ') - covered = {x.split(' ')[0]: x.split(' ')[1].replace(')', '').replace('(', '') for x in covered} - return covered - - -def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_len): - question = question[-cutoff_len:] - answer = answer[-cutoff_len:] - - inputs = stokenizer(question, answer, - return_tensors="pt", - truncation=True, - max_length=max_length_tokenize).to(smodel.device) - try: - score = torch.sigmoid(smodel(**inputs.to(smodel.device)).logits[0].float()).cpu().detach().numpy()[0] - except torch.cuda.OutOfMemoryError as e: - print("GPU OOM 3: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True) - del inputs - traceback.print_exc() - clear_torch_cache() - return 'Response Score: GPU OOM' - except (Exception, RuntimeError) as e: - if 'Expected all tensors to be on the same device' in str(e) or \ - 'expected scalar type Half but found Float' in str(e) or \ - 'probability tensor contains either' in str(e) or \ - 'cublasLt ran into an error!' 
in str(e) or \ - 'device-side assert triggered' in str(e): - print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)), - flush=True) - traceback.print_exc() - clear_torch_cache() - return 'Response Score: GPU Error' - else: - raise - os.environ['TOKENIZERS_PARALLELISM'] = 'true' - return score - - -def check_locals(**kwargs): - # ensure everything in evaluate is here - can_skip_because_locally_generated = no_default_param_names + [ - # get_model: - 'reward_type' - ] - for k in eval_func_param_names: - if k in can_skip_because_locally_generated: - continue - assert k in kwargs, "Missing %s" % k - for k in inputs_kwargs_list: - if k in can_skip_because_locally_generated: - continue - assert k in kwargs, "Missing %s" % k - - for k in list(inspect.signature(get_model).parameters): - if k in can_skip_because_locally_generated: - continue - assert k in kwargs, "Missing %s" % k - - -def get_model_max_length(model_state): - if not isinstance(model_state['tokenizer'], (str, type(None))): - return model_state['tokenizer'].model_max_length - else: - return 2048 - - -def get_max_max_new_tokens(model_state, **kwargs): - if not isinstance(model_state['tokenizer'], (str, type(None))): - max_max_new_tokens = model_state['tokenizer'].model_max_length - else: - max_max_new_tokens = None - - if kwargs['max_max_new_tokens'] is not None and max_max_new_tokens is not None: - return min(max_max_new_tokens, kwargs['max_max_new_tokens']) - elif kwargs['max_max_new_tokens'] is not None: - return kwargs['max_max_new_tokens'] - elif kwargs['memory_restriction_level'] == 1: - return 768 - elif kwargs['memory_restriction_level'] == 2: - return 512 - elif kwargs['memory_restriction_level'] >= 3: - return 256 - else: - # FIXME: Need to update after new model loaded, so user can control with slider - return 2048 - - -def get_minmax_top_k_docs(is_public): - if is_public: - min_top_k_docs = 1 - max_top_k_docs = 8 - label_top_k_docs = "Number of document chunks" - else: - min_top_k_docs = -1 - max_top_k_docs = 100 - label_top_k_docs = "Number of document chunks (-1 = auto fill model context)" - return min_top_k_docs, max_top_k_docs, label_top_k_docs - - -def merge_chat_conversation_history(chat_conversation1, history): - # chat_conversation and history ordered so largest index of list is most recent - if chat_conversation1: - chat_conversation1 = str_to_list(chat_conversation1) - for conv1 in chat_conversation1: - assert isinstance(conv1, (list, tuple)) - assert len(conv1) == 2 - - if isinstance(history, list): - # make copy so only local change - if chat_conversation1: - # so priority will be newest that comes from actual chat history from UI, then chat_conversation - history = chat_conversation1 + history.copy() - elif chat_conversation1: - history = chat_conversation1 - else: - history = [] - return history - - -def history_to_context(history, langchain_mode=None, - add_chat_history_to_context=None, - prompt_type=None, prompt_dict=None, chat=None, model_max_length=None, - memory_restriction_level=None, keep_sources_in_context=None, - system_prompt=None, chat_conversation=None): - """ - consumes all history up to (but not including) latest history item that is presumed to be an [instruction, None] pair - :param history: - :param langchain_mode: - :param add_chat_history_to_context: - :param prompt_type: - :param prompt_dict: - :param chat: - :param model_max_length: - :param memory_restriction_level: - :param keep_sources_in_context: - :param system_prompt: - :param chat_conversation: - 
:return: - """ - history = merge_chat_conversation_history(chat_conversation, history) - - if len(history) >= 1 and len(history[-1]) >= 2 and not history[-1][1]: - len_history = len(history) - 1 - else: - # full history - len_history = len(history) - - # ensure output will be unique to models - _, _, _, max_prompt_length = get_cutoffs(memory_restriction_level, - for_context=True, model_max_length=model_max_length) - context1 = '' - if max_prompt_length is not None and add_chat_history_to_context: - context1 = '' - # - 1 below because current instruction already in history from user() - for histi in range(0, len_history): - data_point = dict(instruction=history[histi][0], input='', output=history[histi][1]) - prompt, pre_response, terminate_response, chat_sep, chat_turn_sep = \ - generate_prompt(data_point, - prompt_type, - prompt_dict, - chat, - reduced=True, - making_context=True, - system_prompt=system_prompt, - histi=histi) - # md -> back to text, maybe not super important if model trained enough - if not keep_sources_in_context and langchain_mode != 'Disabled' and prompt.find(super_source_prefix) >= 0: - # FIXME: This is relatively slow even for small amount of text, like 0.3s each history item - import re - prompt = re.sub(f'{re.escape(super_source_prefix)}.*?{re.escape(super_source_postfix)}', '', prompt, - flags=re.DOTALL) - if prompt.endswith('\n
<p>'): - prompt = prompt[:-4] - prompt = prompt.replace('<br>
', chat_turn_sep) - if not prompt.endswith(chat_turn_sep): - prompt += chat_turn_sep - # most recent first, add older if can - # only include desired chat history - if len(prompt + context1) > max_prompt_length: - break - context1 += prompt - - _, pre_response, terminate_response, chat_sep, chat_turn_sep = \ - generate_prompt({}, prompt_type, prompt_dict, - chat, reduced=True, - making_context=True, - system_prompt=system_prompt, - histi=-1) - if context1 and not context1.endswith(chat_turn_sep): - context1 += chat_turn_sep # ensure if terminates abruptly, then human continues on next line - return context1 - - -def get_limited_prompt(instruction, - iinput, - tokenizer, - prompter=None, - inference_server=None, - prompt_type=None, prompt_dict=None, chat=False, max_new_tokens=None, - system_prompt='', - context='', chat_conversation=None, text_context_list=None, - keep_sources_in_context=False, - model_max_length=None, memory_restriction_level=0, - langchain_mode=None, add_chat_history_to_context=True, - verbose=False, - doc_importance=0.5, - min_max_new_tokens=256, - ): - if prompter: - prompt_type = prompter.prompt_type - prompt_dict = prompter.prompt_dict - chat = prompter.chat - stream_output = prompter.stream_output - system_prompt = prompter.system_prompt - - # merge handles if chat_conversation is None - history = [] - history = merge_chat_conversation_history(chat_conversation, history) - history_to_context_func = functools.partial(history_to_context, - langchain_mode=langchain_mode, - add_chat_history_to_context=add_chat_history_to_context, - prompt_type=prompt_type, - prompt_dict=prompt_dict, - chat=chat, - model_max_length=model_max_length, - memory_restriction_level=memory_restriction_level, - keep_sources_in_context=keep_sources_in_context, - system_prompt=system_prompt) - context2 = history_to_context_func(history) - context1 = context - if context1 is None: - context1 = '' - - from h2oai_pipeline import H2OTextGenerationPipeline - data_point_just_instruction = dict(context='', instruction=instruction, input='') - prompt_just_instruction = prompter.generate_prompt(data_point_just_instruction) - instruction, num_instruction_tokens = H2OTextGenerationPipeline.limit_prompt(instruction, tokenizer) - num_instruction_tokens_real = get_token_count(prompt_just_instruction, tokenizer) - num_instruction_tokens += (num_instruction_tokens_real - num_instruction_tokens) - - context1, num_context1_tokens = H2OTextGenerationPipeline.limit_prompt(context1, tokenizer) - context2, num_context2_tokens = H2OTextGenerationPipeline.limit_prompt(context2, tokenizer) - iinput, num_iinput_tokens = H2OTextGenerationPipeline.limit_prompt(iinput, tokenizer) - if text_context_list is None: - text_context_list = [] - num_doc_tokens = sum([get_token_count(x + '\n\n', tokenizer) for x in text_context_list]) - - num_prompt_tokens0 = (num_instruction_tokens or 0) + \ - (num_context1_tokens or 0) + \ - (num_context2_tokens or 0) + \ - (num_iinput_tokens or 0) + \ - (num_doc_tokens or 0) - - # go down to no less than 256, about 1 paragraph - # use max_new_tokens before use num_prompt_tokens0 else would be negative or ~0 - min_max_new_tokens = min(min_max_new_tokens, max_new_tokens) - # by default assume can handle all chat and docs - chat_index = 0 - - # allowed residual is either half of what is allowed if doc exceeds half, or is rest of what doc didn't consume - num_non_doc_tokens = num_prompt_tokens0 - num_doc_tokens - # to doc first then non-doc, shouldn't matter much either way - doc_max_length = 
max(model_max_length - num_non_doc_tokens, doc_importance * model_max_length) - top_k_docs, one_doc_size, num_doc_tokens = get_docs_tokens(tokenizer, text_context_list=text_context_list, - max_input_tokens=doc_max_length) - non_doc_max_length = max(model_max_length - num_doc_tokens, (1.0 - doc_importance) * model_max_length) - - if num_non_doc_tokens > non_doc_max_length: - # need to limit in some way, keep portion of history but all of context and instruction - # 1) drop iinput (unusual to include anyways) - # 2) reduce history - # 3) reduce context1 - # 4) limit instruction so will fit - diff1 = non_doc_max_length - ( - num_instruction_tokens + num_context1_tokens + num_context2_tokens + min_max_new_tokens) - diff2 = non_doc_max_length - (num_instruction_tokens + num_context1_tokens + min_max_new_tokens) - diff3 = non_doc_max_length - (num_instruction_tokens + min_max_new_tokens) - diff4 = non_doc_max_length - min_max_new_tokens - if diff1 > 0: - # then should be able to do #1 - iinput = '' - num_iinput_tokens = 0 - elif diff2 > 0 > diff1: - # then may be able to do #1 + #2 - iinput = '' - num_iinput_tokens = 0 - chat_index_final = len(history) - for chat_index in range(len(history)): - # NOTE: history and chat_conversation are older for first entries - # FIXME: This is a slow for many short conversations - context2 = history_to_context_func(history[chat_index:]) - num_context2_tokens = get_token_count(context2, tokenizer) - diff1 = non_doc_max_length - ( - num_instruction_tokens + num_context1_tokens + num_context2_tokens + min_max_new_tokens) - if diff1 > 0: - chat_index_final = chat_index - if verbose: - print("chat_conversation used %d out of %d" % (chat_index, len(history)), flush=True) - break - chat_index = chat_index_final # i.e. if chat_index == len(history), then nothing can be consumed - elif diff3 > 0 > diff2: - # then may be able to do #1 + #2 + #3 - iinput = '' - num_iinput_tokens = 0 - context2 = '' - num_context2_tokens = 0 - context1, num_context1_tokens = H2OTextGenerationPipeline.limit_prompt(context1, tokenizer, - max_prompt_length=diff3) - if num_context1_tokens <= diff3: - pass - else: - print("failed to reduce", flush=True) - else: - # then must be able to do #1 + #2 + #3 + #4 - iinput = '' - num_iinput_tokens = 0 - context2 = '' - num_context2_tokens = 0 - context1 = '' - num_context1_tokens = 0 - # diff4 accounts for real prompting for instruction - # FIXME: history_to_context could include instruction, in case system prompt long, we overcount and could have more free tokens - instruction, num_instruction_tokens = H2OTextGenerationPipeline.limit_prompt(instruction, tokenizer, - max_prompt_length=diff4) - # get actual tokens - data_point_just_instruction = dict(context='', instruction=instruction, input='') - prompt_just_instruction = prompter.generate_prompt(data_point_just_instruction) - num_instruction_tokens_real = get_token_count(prompt_just_instruction, tokenizer) - num_instruction_tokens += (num_instruction_tokens_real - num_instruction_tokens) - - # update full context - context = context1 + context2 - # update token counts (docs + non-docs, all tokens) - num_prompt_tokens = (num_instruction_tokens or 0) + \ - (num_context1_tokens or 0) + \ - (num_context2_tokens or 0) + \ - (num_iinput_tokens or 0) + \ - (num_doc_tokens or 0) - - # update max_new_tokens - if inference_server and inference_server.startswith('http'): - # assume TGI/Gradio setup to consume tokens and have long output too, even if exceeds model capacity. 
- pass - else: - # limit so max_new_tokens = prompt + new < max - # otherwise model can fail etc. e.g. for distilgpt2 asking for 1024 tokens is enough to fail if prompt=1 token - max_new_tokens = min(max_new_tokens, model_max_length - num_prompt_tokens) - - if prompter is None: - # get prompter - debug = False - stream_output = False # doesn't matter - prompter = Prompter(prompt_type, prompt_dict, debug=debug, chat=chat, stream_output=stream_output, - system_prompt=system_prompt) - - data_point = dict(context=context, instruction=instruction, input=iinput) - # handle promptA/promptB addition if really from history. - # if not from history, then reduced=False inside correct - # if mixed, then no specific correct thing to do, so treat like history and promptA/B will come first still - context_from_history = len(history) > 0 and len(context1) > 0 - prompt = prompter.generate_prompt(data_point, context_from_history=context_from_history) - num_prompt_tokens_actual = get_token_count(prompt, tokenizer) - - return prompt, \ - instruction, iinput, context, \ - num_prompt_tokens, max_new_tokens, num_prompt_tokens0, num_prompt_tokens_actual, \ - chat_index, top_k_docs, one_doc_size - - -def get_docs_tokens(tokenizer, text_context_list=[], max_input_tokens=None): - if text_context_list is None or len(text_context_list) == 0: - return 0, None, 0 - if max_input_tokens is None: - max_input_tokens = tokenizer.model_max_length - tokens = [get_token_count(x + '\n\n', tokenizer) for x in text_context_list] - tokens_cumsum = np.cumsum(tokens) - where_res = np.where(tokens_cumsum < max_input_tokens)[0] - # if below condition fails, then keep top_k_docs=-1 and trigger special handling next - if where_res.shape[0] > 0: - top_k_docs = 1 + where_res[-1] - one_doc_size = None - num_doc_tokens = tokens_cumsum[top_k_docs - 1] # by index - else: - # if here, means 0 and just do best with 1 doc - top_k_docs = 1 - text_context_list = text_context_list[:top_k_docs] - # critical protection - from src.h2oai_pipeline import H2OTextGenerationPipeline - doc_content = text_context_list[0] - doc_content, new_tokens0 = H2OTextGenerationPipeline.limit_prompt(doc_content, - tokenizer, - max_prompt_length=max_input_tokens) - text_context_list[0] = doc_content - one_doc_size = len(doc_content) - num_doc_tokens = get_token_count(doc_content + '\n\n', tokenizer) - print("Unexpected large chunks and can't add to context, will add 1 anyways. 
Tokens %s -> %s" % ( - tokens[0], new_tokens0), flush=True) - return top_k_docs, one_doc_size, num_doc_tokens - - -def entrypoint_main(): - """ - Examples: - - WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 --master_port=1234 generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca_6B - python generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights='lora-alpaca_6B' - python generate.py --base_model='EleutherAI/gpt-neox-20b' --lora_weights='lora-alpaca_20B' - - # generate without lora weights, no prompt - python generate.py --base_model='EleutherAI/gpt-neox-20b' --prompt_type='plain' - python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='dai_faq' - - python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='dai_faq' --lora_weights='lora_20B_daifaq' - # OpenChatKit settings: - python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot --debug=True --num_beams=1 --temperature=0.6 --top_k=40 --top_p=1.0 - - python generate.py --base_model='distilgpt2' --prompt_type='plain' --debug=True --num_beams=1 --temperature=0.6 --top_k=40 --top_p=1.0 --share=False - python generate.py --base_model='t5-large' --prompt_type='simple_instruct' - python generate.py --base_model='philschmid/bart-large-cnn-samsum' - python generate.py --base_model='philschmid/flan-t5-base-samsum' - python generate.py --base_model='facebook/mbart-large-50-many-to-many-mmt' - - python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot' --lora_weights='GPT-NeoXT-Chat-Base-20B.merged.json.8_epochs.57b2892c53df5b8cefac45f84d019cace803ef26.28' - - must have 4*48GB GPU and run without 8bit in order for sharding to work with use_gpu_id=False - can also pass --prompt_type='human_bot' and model can somewhat handle instructions without being instruct tuned - python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --use_gpu_id=False --prompt_type='human_bot' - - python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b - """ - H2O_Fire(main) - - -if __name__ == "__main__": - entrypoint_main()
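A worked sketch (not part of gen.py) of the selection rule in get_docs_tokens() above: whole document chunks are kept while the cumulative token count stays under the budget, and at least one chunk is always kept. The token counts below are made up for illustration.

    import numpy as np

    def docs_that_fit(doc_token_counts, max_input_tokens):
        # same cumulative-sum rule as get_docs_tokens(): top_k_docs = 1 + last index under budget
        tokens_cumsum = np.cumsum(doc_token_counts)
        where_res = np.where(tokens_cumsum < max_input_tokens)[0]
        return (1 + where_res[-1]) if where_res.shape[0] > 0 else 1

    print(docs_that_fit([300, 500, 800, 1200], max_input_tokens=2000))  # -> 3 chunks fit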
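And a hedged client-side sketch (not part of gen.py) of calling the stable /submit_nochat_api route that the gradio inference-server branch of evaluate() uses: a string-encoded dict goes in, and a string-encoded dict with 'response' and 'sources' keys comes back. It assumes an h2oGPT gradio server is already running locally; only a few of the accepted keys are shown.

    import ast
    from gradio_client import Client

    client = Client("http://localhost:7860")  # assumption: local server started as in the examples above
    kwargs = dict(instruction_nochat="Who are you?",
                  prompt_type="human_bot",
                  stream_output=False)
    res = client.predict(str(dict(kwargs)), api_name="/submit_nochat_api")
    res_dict = ast.literal_eval(res)
    print(res_dict["response"], res_dict["sources"])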