pseudotensor committed on
Commit b368114
1 Parent(s): b64f5c9

Update with h2oGPT hash c37e5ee65166e4d964435193d5d8c23aaa8d3f09

Files changed (8)
  1. client_test.py +20 -7
  2. enums.py +8 -0
  3. evaluate_params.py +1 -0
  4. gen.py +59 -23
  5. gpt_langchain.py +145 -38
  6. gradio_runner.py +15 -1
  7. prompter.py +25 -0
  8. utils.py +17 -1
client_test.py CHANGED
@@ -69,6 +69,7 @@ def get_args(prompt, prompt_type, chat=False, stream_output=False,
69
  top_k_docs=3,
70
  langchain_mode='Disabled',
71
  langchain_action=LangChainAction.QUERY.value,
 
72
  prompt_dict=None):
73
  from collections import OrderedDict
74
  kwargs = OrderedDict(instruction=prompt if chat else '', # only for chat=True
@@ -95,6 +96,7 @@ def get_args(prompt, prompt_type, chat=False, stream_output=False,
95
  iinput_nochat='', # only for chat=False
96
  langchain_mode=langchain_mode,
97
  langchain_action=langchain_action,
 
98
  top_k_docs=top_k_docs,
99
  chunk=True,
100
  chunk_size=512,
@@ -203,6 +205,7 @@ def run_client_nochat_api_lean_morestuff(prompt, prompt_type='human_bot', max_ne
203
  iinput_nochat='',
204
  langchain_mode='Disabled',
205
  langchain_action=LangChainAction.QUERY.value,
 
206
  top_k_docs=4,
207
  document_subset=DocumentChoices.Relevant.name,
208
  document_choice=[],
@@ -225,23 +228,30 @@ def run_client_nochat_api_lean_morestuff(prompt, prompt_type='human_bot', max_ne
225
  @pytest.mark.skip(reason="For manual use against some server, no server launched")
226
  def test_client_chat(prompt_type='human_bot'):
227
  return run_client_chat(prompt='Who are you?', prompt_type=prompt_type, stream_output=False, max_new_tokens=50,
228
- langchain_mode='Disabled', langchain_action=LangChainAction.QUERY.value)
 
 
229
 
230
 
231
  @pytest.mark.skip(reason="For manual use against some server, no server launched")
232
  def test_client_chat_stream(prompt_type='human_bot'):
233
  return run_client_chat(prompt="Tell a very long kid's story about birds.", prompt_type=prompt_type,
234
  stream_output=True, max_new_tokens=512,
235
- langchain_mode='Disabled', langchain_action=LangChainAction.QUERY.value)
 
 
236
 
237
 
238
- def run_client_chat(prompt, prompt_type, stream_output, max_new_tokens, langchain_mode, langchain_action,
 
239
  prompt_dict=None):
240
  client = get_client(serialize=False)
241
 
242
  kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
243
- max_new_tokens=max_new_tokens, langchain_mode=langchain_mode,
 
244
  langchain_action=langchain_action,
 
245
  prompt_dict=prompt_dict)
246
  return run_client(client, prompt, args, kwargs)
247
 
@@ -285,15 +295,18 @@ def run_client(client, prompt, args, kwargs, do_md_to_text=True, verbose=False):
285
  def test_client_nochat_stream(prompt_type='human_bot'):
286
  return run_client_nochat_gen(prompt="Tell a very long kid's story about birds.", prompt_type=prompt_type,
287
  stream_output=True, max_new_tokens=512,
288
- langchain_mode='Disabled', langchain_action=LangChainAction.QUERY.value)
 
 
289
 
290
 
291
- def run_client_nochat_gen(prompt, prompt_type, stream_output, max_new_tokens, langchain_mode, langchain_action):
 
292
  client = get_client(serialize=False)
293
 
294
  kwargs, args = get_args(prompt, prompt_type, chat=False, stream_output=stream_output,
295
  max_new_tokens=max_new_tokens, langchain_mode=langchain_mode,
296
- langchain_action=langchain_action)
297
  return run_client_gen(client, prompt, args, kwargs)
298
 
299
 
 
69
  top_k_docs=3,
70
  langchain_mode='Disabled',
71
  langchain_action=LangChainAction.QUERY.value,
72
+ langchain_agents=[],
73
  prompt_dict=None):
74
  from collections import OrderedDict
75
  kwargs = OrderedDict(instruction=prompt if chat else '', # only for chat=True
 
96
  iinput_nochat='', # only for chat=False
97
  langchain_mode=langchain_mode,
98
  langchain_action=langchain_action,
99
+ langchain_agents=langchain_agents,
100
  top_k_docs=top_k_docs,
101
  chunk=True,
102
  chunk_size=512,
 
205
  iinput_nochat='',
206
  langchain_mode='Disabled',
207
  langchain_action=LangChainAction.QUERY.value,
208
+ langchain_agents=[],
209
  top_k_docs=4,
210
  document_subset=DocumentChoices.Relevant.name,
211
  document_choice=[],
 
228
  @pytest.mark.skip(reason="For manual use against some server, no server launched")
229
  def test_client_chat(prompt_type='human_bot'):
230
  return run_client_chat(prompt='Who are you?', prompt_type=prompt_type, stream_output=False, max_new_tokens=50,
231
+ langchain_mode='Disabled',
232
+ langchain_action=LangChainAction.QUERY.value,
233
+ langchain_agents=[])
234
 
235
 
236
  @pytest.mark.skip(reason="For manual use against some server, no server launched")
237
  def test_client_chat_stream(prompt_type='human_bot'):
238
  return run_client_chat(prompt="Tell a very long kid's story about birds.", prompt_type=prompt_type,
239
  stream_output=True, max_new_tokens=512,
240
+ langchain_mode='Disabled',
241
+ langchain_action=LangChainAction.QUERY.value,
242
+ langchain_agents=[])
243
 
244
 
245
+ def run_client_chat(prompt, prompt_type, stream_output, max_new_tokens,
246
+ langchain_mode, langchain_action, langchain_agents,
247
  prompt_dict=None):
248
  client = get_client(serialize=False)
249
 
250
  kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
251
+ max_new_tokens=max_new_tokens,
252
+ langchain_mode=langchain_mode,
253
  langchain_action=langchain_action,
254
+ langchain_agents=langchain_agents,
255
  prompt_dict=prompt_dict)
256
  return run_client(client, prompt, args, kwargs)
257
 
 
295
  def test_client_nochat_stream(prompt_type='human_bot'):
296
  return run_client_nochat_gen(prompt="Tell a very long kid's story about birds.", prompt_type=prompt_type,
297
  stream_output=True, max_new_tokens=512,
298
+ langchain_mode='Disabled',
299
+ langchain_action=LangChainAction.QUERY.value,
300
+ langchain_agents=[])
301
 
302
 
303
+ def run_client_nochat_gen(prompt, prompt_type, stream_output, max_new_tokens,
304
+ langchain_mode, langchain_action, langchain_agents):
305
  client = get_client(serialize=False)
306
 
307
  kwargs, args = get_args(prompt, prompt_type, chat=False, stream_output=stream_output,
308
  max_new_tokens=max_new_tokens, langchain_mode=langchain_mode,
309
+ langchain_action=langchain_action, langchain_agents=langchain_agents)
310
  return run_client_gen(client, prompt, args, kwargs)
311
 
312
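
Every request payload built by client_test.py now carries a langchain_agents list alongside langchain_mode and langchain_action. A minimal sketch of the equivalent raw call against a running h2oGPT server is below; the server URL and the /submit_nochat_api endpoint name are assumptions taken from other client_test.py helpers not shown in this diff, and fields left out of kwargs are back-filled server-side, as gradio_runner.py now does for langchain_agents.

from gradio_client import Client

client = Client("http://localhost:7860")  # assumed local h2oGPT server
kwargs = dict(instruction_nochat="Who are you?",
              prompt_type='human_bot',
              langchain_mode='Disabled',
              langchain_action='Query',   # LangChainAction.QUERY.value
              langchain_agents=[],        # new field threaded through by this commit
              top_k_docs=4)
# endpoint name is an assumption; res is expected to be a stringified dict
# with 'response' and 'sources' keys
res = client.predict(str(dict(kwargs)), api_name='/submit_nochat_api')
print(res)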
 
enums.py CHANGED
@@ -31,6 +31,7 @@ class PromptType(Enum):
31
  mptinstruct = 25
32
  mptchat = 26
33
  falcon = 27
 
34
 
35
 
36
  class DocumentChoices(Enum):
@@ -71,6 +72,13 @@ class LangChainAction(Enum):
71
  SUMMARIZE_REFINE = "Summarize_refine"
72
 
73
 
 
 
 
 
 
 
 
74
  no_server_str = no_lora_str = no_model_str = '[None/Remove]'
75
 
76
  # from site-packages/langchain/llms/openai.py
 
31
  mptinstruct = 25
32
  mptchat = 26
33
  falcon = 27
34
+ guanaco = 28
35
 
36
 
37
  class DocumentChoices(Enum):
 
72
  SUMMARIZE_REFINE = "Summarize_refine"
73
 
74
 
75
+ class LangChainAgent(Enum):
76
+ """LangChain agents"""
77
+
78
+ SEARCH = "Search"
79
+ # CSV = "csv" # WIP
80
+
81
+
82
  no_server_str = no_lora_str = no_model_str = '[None/Remove]'
83
 
84
  # from site-packages/langchain/llms/openai.py
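
enums.py gains a guanaco prompt type and a new LangChainAgent enum whose values feed the langchain_agents parameter added everywhere else in this commit. A small sketch, restating the set-difference validation gen.py now performs in main() and evaluate():

from enums import LangChainAgent

langchain_agents_list = [x.value for x in list(LangChainAgent)]  # currently just ['Search']

def check_agents(langchain_agents):
    # same assertion this commit adds to gen.py
    assert len(set(langchain_agents).difference(langchain_agents_list)) == 0, \
        "Invalid langchain_agents %s" % langchain_agents

check_agents([])          # ok: agents disabled
check_agents(['Search'])  # ok: web-search agent (needs SERPAPI_API_KEY per the gen.py docstring)
# check_agents(['csv'])   # would raise: the CSV agent is still WIP and commented out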
evaluate_params.py CHANGED
@@ -31,6 +31,7 @@ eval_func_param_names = ['instruction',
31
  'iinput_nochat',
32
  'langchain_mode',
33
  'langchain_action',
 
34
  'top_k_docs',
35
  'chunk',
36
  'chunk_size',
 
31
  'iinput_nochat',
32
  'langchain_mode',
33
  'langchain_action',
34
+ 'langchain_agents',
35
  'top_k_docs',
36
  'chunk',
37
  'chunk_size',
gen.py CHANGED
@@ -29,11 +29,11 @@ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is
29
 
30
  from evaluate_params import eval_func_param_names, no_default_param_names
31
  from enums import DocumentChoices, LangChainMode, no_lora_str, model_token_mapping, no_model_str, source_prefix, \
32
- source_postfix, LangChainAction
33
  from loaders import get_loaders
34
  from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial, EThread, get_githash, \
35
  import_matplotlib, get_device, makedirs, get_kwargs, start_faulthandler, get_hf_server, FakeTokenizer, remove, \
36
- have_langchain
37
 
38
  start_faulthandler()
39
  import_matplotlib()
@@ -54,6 +54,8 @@ langchain_modes = [x.value for x in list(LangChainMode)]
54
 
55
  langchain_actions = [x.value for x in list(LangChainAction)]
56
 
 
 
57
  scratch_base_dir = '/tmp/'
58
 
59
 
@@ -134,7 +136,7 @@ def main(
134
  extra_lora_options: typing.List[str] = [],
135
  extra_server_options: typing.List[str] = [],
136
 
137
- score_model: str = 'OpenAssistant/reward-model-deberta-v3-large-v2',
138
 
139
  eval_filename: str = None,
140
  eval_prompts_only_num: int = 0,
@@ -143,15 +145,18 @@ def main(
143
 
144
  langchain_mode: str = None,
145
  langchain_action: str = LangChainAction.QUERY.value,
 
146
  force_langchain_evaluate: bool = False,
147
  visible_langchain_modes: list = ['UserData', 'MyData'],
148
  # WIP:
149
  # visible_langchain_actions: list = langchain_actions.copy(),
150
  visible_langchain_actions: list = [LangChainAction.QUERY.value, LangChainAction.SUMMARIZE_MAP.value],
 
151
  document_subset: str = DocumentChoices.Relevant.name,
152
  document_choice: list = [],
153
  user_path: str = None,
154
  detect_user_path_changes_every_query: bool = False,
 
155
  load_db_if_exists: bool = True,
156
  keep_sources_in_context: bool = False,
157
  db_type: str = 'chroma',
@@ -196,6 +201,8 @@ def main(
196
  Or Address can be "openai_chat" or "openai" for OpenAI API
197
  e.g. python generate.py --inference_server="openai_chat" --base_model=gpt-3.5-turbo
198
  e.g. python generate.py --inference_server="openai" --base_model=text-davinci-003
 
 
199
  :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
200
  :param prompt_dict: If prompt_type=custom, then expects (some) items returned by get_prompt(..., return_dict=True)
201
  :param model_lock: Lock models to specific combinations, for ease of use and extending to many models
@@ -271,18 +278,24 @@ def main(
271
  :param extra_model_options: extra models to show in list in gradio
272
  :param extra_lora_options: extra LORA to show in list in gradio
273
  :param extra_server_options: extra servers to show in list in gradio
274
- :param score_model: which model to score responses (None means no scoring)
 
 
 
275
  :param eval_filename: json file to use for evaluation, if None is sharegpt
276
  :param eval_prompts_only_num: for no gradio benchmark, if using eval_filename prompts for eval instead of examples
277
  :param eval_prompts_only_seed: for no gradio benchmark, seed for eval_filename sampling
278
  :param eval_as_output: for no gradio benchmark, whether to test eval_filename output itself
279
  :param langchain_mode: Data source to include. Choose "UserData" to only consume files from make_db.py.
 
280
  WARNING: wiki_full requires extra data processing via read_wiki_full.py and requires really good workstation to generate db, unless already present.
281
  :param langchain_action: Mode of langchain operations on documents.
282
  Query: Make query of document(s)
283
  Summarize or Summarize_map_reduce: Summarize document(s) via map_reduce
284
  Summarize_all: Summarize document(s) using entire document at once
285
  Summarize_refine: Summarize document(s) using entire document, and try to refine before returning summary
 
 
286
  :param force_langchain_evaluate: Whether to force langchain LLM use even if not doing langchain, mostly for testing.
287
  :param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode.
288
  If already have db, any new/changed files are added automatically if path set, does not have to be same path used for prior db sources
@@ -293,17 +306,18 @@ def main(
293
  But wiki_full is expensive and requires preparation
294
  To allow scratch space only live in session, add 'MyData' to list
295
  Default: If only want to consume local files, e.g. prepared by make_db.py, only include ['UserData']
296
- FIXME: Avoid 'All' for now, not implemented
297
  :param visible_langchain_actions: Which actions to allow
 
298
  :param document_subset: Default document choice when taking subset of collection
299
  :param document_choice: Chosen document(s) by internal name
 
300
  :param load_db_if_exists: Whether to load chroma db if exists or re-generate db
301
  :param keep_sources_in_context: Whether to keep url sources in context, not helpful usually
302
  :param db_type: 'faiss' for in-memory or 'chroma' or 'weaviate' for persisted on disk
303
  :param use_openai_embedding: Whether to use OpenAI embeddings for vector db
304
  :param use_openai_model: Whether to use OpenAI model for use with vector db
305
  :param hf_embedding_model: Which HF embedding model to use for vector db
306
- Default is instructor-large with 768 parameters per embedding if have GPUs, else all-MiniLM-L6-v1 if no GPUs
307
  Can also choose simpler model with 384 parameters per embedding: "sentence-transformers/all-MiniLM-L6-v2"
308
  Can also choose even better embedding with 1024 parameters: 'hkunlp/instructor-xl'
309
  We support automatically changing of embeddings for chroma, with a backup of db made if this is done
@@ -327,6 +341,7 @@ def main(
327
  captions_model: str = "Salesforce/blip2-flan-t5-xl", # question/answer capable, 16GB state
328
  captions_model: str = "Salesforce/blip2-flan-t5-xxl", # question/answer capable, 60GB state
329
  Note: opt-based blip2 are not permissive license due to opt and Meta license restrictions
 
330
  :param pre_load_caption_model: Whether to preload caption model, or load after forking parallel doc loader
331
  parallel loading disabled if preload and have images, to prevent deadlocking on cuda context
332
  Recommended if using larger caption model
@@ -394,6 +409,8 @@ def main(
394
  visible_langchain_modes += [langchain_mode]
395
 
396
  assert langchain_action in langchain_actions, "Invalid langchain_action %s" % langchain_action
 
 
397
 
398
  # if specifically chose not to show My or User Data, disable upload, so gradio elements are simpler
399
  if LangChainMode.MY_DATA.value not in visible_langchain_modes:
@@ -413,7 +430,8 @@ def main(
413
  " set user_path and ensure allow_upload_to_user_data=True" % langchain_mode, flush=True)
414
  else:
415
  raise RuntimeError("Please pass --langchain_mode=<chosen mode> out of %s" % langchain_modes)
416
- if not have_langchain and langchain_mode not in [None, LangChainMode.DISABLED.value, LangChainMode.LLM.value, LangChainMode.CHAT_LLM.value]:
 
417
  raise RuntimeError("Asked for LangChain mode but langchain python package cannot be found.")
418
  if langchain_mode is None:
419
  # if not set yet, disable
@@ -474,7 +492,7 @@ def main(
474
  # HF accounted for later in get_max_max_new_tokens()
475
  save_dir = os.getenv('SAVE_DIR', save_dir)
476
  score_model = os.getenv('SCORE_MODEL', score_model)
477
- if score_model == 'None' or score_model is None:
478
  score_model = ''
479
  concurrency_count = int(os.getenv('CONCURRENCY_COUNT', concurrency_count))
480
  api_open = bool(int(os.getenv('API_OPEN', str(int(api_open)))))
@@ -482,6 +500,7 @@ def main(
482
 
483
  n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
484
  if n_gpus == 0:
 
485
  gpu_id = None
486
  load_8bit = False
487
  load_4bit = False
@@ -499,7 +518,11 @@ def main(
499
  if hf_embedding_model is None:
500
  # if no GPUs, use simpler embedding model to avoid cost in time
501
  hf_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
 
 
502
  else:
 
 
503
  if hf_embedding_model is None:
504
  # if still None, then set default
505
  hf_embedding_model = 'hkunlp/instructor-large'
@@ -967,11 +990,13 @@ def get_model(
967
  client = gr_client or hf_client
968
  # Don't return None, None for model, tokenizer so triggers
969
  return client, tokenizer, 'http'
970
- if isinstance(inference_server, str) and inference_server.startswith('openai'):
971
- assert os.getenv('OPENAI_API_KEY'), "Set environment for OPENAI_API_KEY"
972
- # Don't return None, None for model, tokenizer so triggers
973
- # include small token cushion
974
- tokenizer = FakeTokenizer(model_max_length=model_token_mapping[base_model] - 50)
 
 
975
  return inference_server, tokenizer, inference_server
976
  assert not inference_server, "Malformed inference_server=%s" % inference_server
977
  if base_model in non_hf_types:
@@ -1278,6 +1303,7 @@ def evaluate(
1278
  iinput_nochat,
1279
  langchain_mode,
1280
  langchain_action,
 
1281
  top_k_docs,
1282
  chunk,
1283
  chunk_size,
@@ -1298,6 +1324,7 @@ def evaluate(
1298
  raise_generate_gpu_exceptions=None,
1299
  chat_context=None,
1300
  lora_weights=None,
 
1301
  load_db_if_exists=True,
1302
  dbs=None,
1303
  user_path=None,
@@ -1452,6 +1479,8 @@ def evaluate(
1452
  # THIRD PLACE where LangChain referenced, but imports only occur if enabled and have db to use
1453
  assert langchain_mode in langchain_modes, "Invalid langchain_mode %s" % langchain_mode
1454
  assert langchain_action in langchain_actions, "Invalid langchain_action %s" % langchain_action
 
 
1455
  if langchain_mode in ['MyData'] and my_db_state is not None and len(my_db_state) > 0 and my_db_state[0] is not None:
1456
  db1 = my_db_state[0]
1457
  elif dbs is not None and langchain_mode in dbs:
@@ -1484,6 +1513,7 @@ def evaluate(
1484
  inference_server=inference_server,
1485
  stream_output=stream_output,
1486
  prompter=prompter,
 
1487
  load_db_if_exists=load_db_if_exists,
1488
  db=db1,
1489
  user_path=user_path,
@@ -1498,6 +1528,7 @@ def evaluate(
1498
  chunk_size=chunk_size,
1499
  langchain_mode=langchain_mode,
1500
  langchain_action=langchain_action,
 
1501
  document_subset=document_subset,
1502
  document_choice=document_choice,
1503
  db_type=db_type,
@@ -1526,6 +1557,7 @@ def evaluate(
1526
  inference_server=inference_server,
1527
  langchain_mode=langchain_mode,
1528
  langchain_action=langchain_action,
 
1529
  document_subset=document_subset,
1530
  document_choice=document_choice,
1531
  num_prompt_tokens=num_prompt_tokens,
@@ -1549,12 +1581,12 @@ def evaluate(
1549
  clear_torch_cache()
1550
  return
1551
 
1552
- if inference_server.startswith('openai') or inference_server.startswith('http'):
1553
- if inference_server.startswith('openai'):
1554
- import openai
1555
  where_from = "openai_client"
 
1556
 
1557
- openai.api_key = os.getenv("OPENAI_API_KEY")
1558
  terminate_response = prompter.terminate_response or []
1559
  stop_sequences = list(set(terminate_response + [prompter.PreResponse]))
1560
  stop_sequences = [x for x in stop_sequences if x]
@@ -1567,7 +1599,7 @@ def evaluate(
1567
  n=num_return_sequences,
1568
  presence_penalty=1.07 - repetition_penalty + 0.6, # so good default
1569
  )
1570
- if inference_server == 'openai':
1571
  response = openai.Completion.create(
1572
  model=base_model,
1573
  prompt=prompt,
@@ -1590,7 +1622,9 @@ def evaluate(
1590
  yield dict(response=prompter.get_response(prompt + text, prompt=prompt,
1591
  sanitize_bot_response=sanitize_bot_response),
1592
  sources='')
1593
- elif inference_server == 'openai_chat':
 
 
1594
  response = openai.ChatCompletion.create(
1595
  model=base_model,
1596
  messages=[
@@ -1643,6 +1677,7 @@ def evaluate(
1643
  where_from = "gr_client"
1644
  client_langchain_mode = 'Disabled'
1645
  client_langchain_action = LangChainAction.QUERY.value
 
1646
  gen_server_kwargs = dict(temperature=temperature,
1647
  top_p=top_p,
1648
  top_k=top_k,
@@ -1695,6 +1730,7 @@ def evaluate(
1695
  iinput_nochat=gr_iinput, # only for chat=False
1696
  langchain_mode=client_langchain_mode,
1697
  langchain_action=client_langchain_action,
 
1698
  top_k_docs=top_k_docs,
1699
  chunk=chunk,
1700
  chunk_size=chunk_size,
@@ -2276,8 +2312,8 @@ y = np.random.randint(0, 1, 100)
2276
 
2277
  # move to correct position
2278
  for example in examples:
2279
- example += [chat, '', '', LangChainMode.DISABLED.value, LangChainAction.QUERY.value,
2280
- top_k_docs, chunk, chunk_size, [DocumentChoices.Relevant.name], []
2281
  ]
2282
  # adjust examples if non-chat mode
2283
  if not chat:
@@ -2383,14 +2419,14 @@ def check_locals(**kwargs):
2383
 
2384
 
2385
  def get_model_max_length(model_state):
2386
- if not isinstance(model_state['tokenizer'], (str, types.NoneType)):
2387
  return model_state['tokenizer'].model_max_length
2388
  else:
2389
  return 2048
2390
 
2391
 
2392
  def get_max_max_new_tokens(model_state, **kwargs):
2393
- if not isinstance(model_state['tokenizer'], (str, types.NoneType)):
2394
  max_max_new_tokens = model_state['tokenizer'].model_max_length
2395
  else:
2396
  max_max_new_tokens = None
 
29
 
30
  from evaluate_params import eval_func_param_names, no_default_param_names
31
  from enums import DocumentChoices, LangChainMode, no_lora_str, model_token_mapping, no_model_str, source_prefix, \
32
+ source_postfix, LangChainAction, LangChainAgent
33
  from loaders import get_loaders
34
  from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial, EThread, get_githash, \
35
  import_matplotlib, get_device, makedirs, get_kwargs, start_faulthandler, get_hf_server, FakeTokenizer, remove, \
36
+ have_langchain, set_openai
37
 
38
  start_faulthandler()
39
  import_matplotlib()
 
54
 
55
  langchain_actions = [x.value for x in list(LangChainAction)]
56
 
57
+ langchain_agents_list = [x.value for x in list(LangChainAgent)]
58
+
59
  scratch_base_dir = '/tmp/'
60
 
61
 
 
136
  extra_lora_options: typing.List[str] = [],
137
  extra_server_options: typing.List[str] = [],
138
 
139
+ score_model: str = 'auto',
140
 
141
  eval_filename: str = None,
142
  eval_prompts_only_num: int = 0,
 
145
 
146
  langchain_mode: str = None,
147
  langchain_action: str = LangChainAction.QUERY.value,
148
+ langchain_agents: list = [],
149
  force_langchain_evaluate: bool = False,
150
  visible_langchain_modes: list = ['UserData', 'MyData'],
151
  # WIP:
152
  # visible_langchain_actions: list = langchain_actions.copy(),
153
  visible_langchain_actions: list = [LangChainAction.QUERY.value, LangChainAction.SUMMARIZE_MAP.value],
154
+ visible_langchain_agents: list = langchain_agents_list.copy(),
155
  document_subset: str = DocumentChoices.Relevant.name,
156
  document_choice: list = [],
157
  user_path: str = None,
158
  detect_user_path_changes_every_query: bool = False,
159
+ use_llm_if_no_docs: bool = False,
160
  load_db_if_exists: bool = True,
161
  keep_sources_in_context: bool = False,
162
  db_type: str = 'chroma',
 
201
  Or Address can be "openai_chat" or "openai" for OpenAI API
202
  e.g. python generate.py --inference_server="openai_chat" --base_model=gpt-3.5-turbo
203
  e.g. python generate.py --inference_server="openai" --base_model=text-davinci-003
204
+ Or Address can be "vllm:IP:port" or "vllm_chat:IP:port" for OpenAI-compliant vLLM endpoint
205
+ Note: vllm_chat not supported by vLLM project.
206
  :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
207
  :param prompt_dict: If prompt_type=custom, then expects (some) items returned by get_prompt(..., return_dict=True)
208
  :param model_lock: Lock models to specific combinations, for ease of use and extending to many models
 
278
  :param extra_model_options: extra models to show in list in gradio
279
  :param extra_lora_options: extra LORA to show in list in gradio
280
  :param extra_server_options: extra servers to show in list in gradio
281
+ :param score_model: which model to score responses
282
+ None: no response scoring
283
+ 'auto': auto mode, '' (no model) for CPU, 'OpenAssistant/reward-model-deberta-v3-large-v2' for GPU,
284
+ because on CPU takes too much compute just for scoring response
285
  :param eval_filename: json file to use for evaluation, if None is sharegpt
286
  :param eval_prompts_only_num: for no gradio benchmark, if using eval_filename prompts for eval instead of examples
287
  :param eval_prompts_only_seed: for no gradio benchmark, seed for eval_filename sampling
288
  :param eval_as_output: for no gradio benchmark, whether to test eval_filename output itself
289
  :param langchain_mode: Data source to include. Choose "UserData" to only consume files from make_db.py.
290
+ None: auto mode, check if langchain package exists, at least do ChatLLM if so, else Disabled
291
  WARNING: wiki_full requires extra data processing via read_wiki_full.py and requires really good workstation to generate db, unless already present.
292
  :param langchain_action: Mode of langchain operations on documents.
293
  Query: Make query of document(s)
294
  Summarize or Summarize_map_reduce: Summarize document(s) via map_reduce
295
  Summarize_all: Summarize document(s) using entire document at once
296
  Summarize_refine: Summarize document(s) using entire document, and try to refine before returning summary
297
+ :param langchain_agents: Which agents to use
298
+ 'search': Use Web Search as context for LLM response, e.g. SERP if have SERPAPI_API_KEY in env
299
  :param force_langchain_evaluate: Whether to force langchain LLM use even if not doing langchain, mostly for testing.
300
  :param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode.
301
  If already have db, any new/changed files are added automatically if path set, does not have to be same path used for prior db sources
 
306
  But wiki_full is expensive and requires preparation
307
  To allow scratch space only live in session, add 'MyData' to list
308
  Default: If only want to consume local files, e.g. prepared by make_db.py, only include ['UserData']
 
309
  :param visible_langchain_actions: Which actions to allow
310
+ :param visible_langchain_agents: Which agents to allow
311
  :param document_subset: Default document choice when taking subset of collection
312
  :param document_choice: Chosen document(s) by internal name
313
+ :param use_llm_if_no_docs: Whether to use LLM even if no documents, when langchain_mode=UserData or MyData
314
  :param load_db_if_exists: Whether to load chroma db if exists or re-generate db
315
  :param keep_sources_in_context: Whether to keep url sources in context, not helpful usually
316
  :param db_type: 'faiss' for in-memory or 'chroma' or 'weaviate' for persisted on disk
317
  :param use_openai_embedding: Whether to use OpenAI embeddings for vector db
318
  :param use_openai_model: Whether to use OpenAI model for use with vector db
319
  :param hf_embedding_model: Which HF embedding model to use for vector db
320
+ Default is instructor-large with 768 parameters per embedding if have GPUs, else all-MiniLM-L6-v2 if no GPUs
321
  Can also choose simpler model with 384 parameters per embedding: "sentence-transformers/all-MiniLM-L6-v2"
322
  Can also choose even better embedding with 1024 parameters: 'hkunlp/instructor-xl'
323
  We support automatically changing of embeddings for chroma, with a backup of db made if this is done
 
341
  captions_model: str = "Salesforce/blip2-flan-t5-xl", # question/answer capable, 16GB state
342
  captions_model: str = "Salesforce/blip2-flan-t5-xxl", # question/answer capable, 60GB state
343
  Note: opt-based blip2 are not permissive license due to opt and Meta license restrictions
344
+ Disabled for CPU since BLIP requires CUDA
345
  :param pre_load_caption_model: Whether to preload caption model, or load after forking parallel doc loader
346
  parallel loading disabled if preload and have images, to prevent deadlocking on cuda context
347
  Recommended if using larger caption model
 
409
  visible_langchain_modes += [langchain_mode]
410
 
411
  assert langchain_action in langchain_actions, "Invalid langchain_action %s" % langchain_action
412
+ assert len(
413
+ set(langchain_agents).difference(langchain_agents_list)) == 0, "Invalid langchain_agents %s" % langchain_agents
414
 
415
  # if specifically chose not to show My or User Data, disable upload, so gradio elements are simpler
416
  if LangChainMode.MY_DATA.value not in visible_langchain_modes:
 
430
  " set user_path and ensure allow_upload_to_user_data=True" % langchain_mode, flush=True)
431
  else:
432
  raise RuntimeError("Please pass --langchain_mode=<chosen mode> out of %s" % langchain_modes)
433
+ if not have_langchain and langchain_mode not in [None, LangChainMode.DISABLED.value, LangChainMode.LLM.value,
434
+ LangChainMode.CHAT_LLM.value]:
435
  raise RuntimeError("Asked for LangChain mode but langchain python package cannot be found.")
436
  if langchain_mode is None:
437
  # if not set yet, disable
 
492
  # HF accounted for later in get_max_max_new_tokens()
493
  save_dir = os.getenv('SAVE_DIR', save_dir)
494
  score_model = os.getenv('SCORE_MODEL', score_model)
495
+ if str(score_model) == 'None':
496
  score_model = ''
497
  concurrency_count = int(os.getenv('CONCURRENCY_COUNT', concurrency_count))
498
  api_open = bool(int(os.getenv('API_OPEN', str(int(api_open)))))
 
500
 
501
  n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
502
  if n_gpus == 0:
503
+ enable_captions = False
504
  gpu_id = None
505
  load_8bit = False
506
  load_4bit = False
 
518
  if hf_embedding_model is None:
519
  # if no GPUs, use simpler embedding model to avoid cost in time
520
  hf_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
521
+ if score_model == 'auto':
522
+ score_model = ''
523
  else:
524
+ if score_model == 'auto':
525
+ score_model = 'OpenAssistant/reward-model-deberta-v3-large-v2'
526
  if hf_embedding_model is None:
527
  # if still None, then set default
528
  hf_embedding_model = 'hkunlp/instructor-large'
 
990
  client = gr_client or hf_client
991
  # Don't return None, None for model, tokenizer so triggers
992
  return client, tokenizer, 'http'
993
+ if isinstance(inference_server, str) and (
994
+ inference_server.startswith('openai') or inference_server.startswith('vllm')):
995
+ if inference_server.startswith('openai'):
996
+ assert os.getenv('OPENAI_API_KEY'), "Set environment for OPENAI_API_KEY"
997
+ # Don't return None, None for model, tokenizer so triggers
998
+ # include small token cushion
999
+ tokenizer = FakeTokenizer(model_max_length=model_token_mapping[base_model] - 50)
1000
  return inference_server, tokenizer, inference_server
1001
  assert not inference_server, "Malformed inference_server=%s" % inference_server
1002
  if base_model in non_hf_types:
 
1303
  iinput_nochat,
1304
  langchain_mode,
1305
  langchain_action,
1306
+ langchain_agents,
1307
  top_k_docs,
1308
  chunk,
1309
  chunk_size,
 
1324
  raise_generate_gpu_exceptions=None,
1325
  chat_context=None,
1326
  lora_weights=None,
1327
+ use_llm_if_no_docs=False,
1328
  load_db_if_exists=True,
1329
  dbs=None,
1330
  user_path=None,
 
1479
  # THIRD PLACE where LangChain referenced, but imports only occur if enabled and have db to use
1480
  assert langchain_mode in langchain_modes, "Invalid langchain_mode %s" % langchain_mode
1481
  assert langchain_action in langchain_actions, "Invalid langchain_action %s" % langchain_action
1482
+ assert len(
1483
+ set(langchain_agents).difference(langchain_agents_list)) == 0, "Invalid langchain_agents %s" % langchain_agents
1484
  if langchain_mode in ['MyData'] and my_db_state is not None and len(my_db_state) > 0 and my_db_state[0] is not None:
1485
  db1 = my_db_state[0]
1486
  elif dbs is not None and langchain_mode in dbs:
 
1513
  inference_server=inference_server,
1514
  stream_output=stream_output,
1515
  prompter=prompter,
1516
+ use_llm_if_no_docs=use_llm_if_no_docs,
1517
  load_db_if_exists=load_db_if_exists,
1518
  db=db1,
1519
  user_path=user_path,
 
1528
  chunk_size=chunk_size,
1529
  langchain_mode=langchain_mode,
1530
  langchain_action=langchain_action,
1531
+ langchain_agents=langchain_agents,
1532
  document_subset=document_subset,
1533
  document_choice=document_choice,
1534
  db_type=db_type,
 
1557
  inference_server=inference_server,
1558
  langchain_mode=langchain_mode,
1559
  langchain_action=langchain_action,
1560
+ langchain_agents=langchain_agents,
1561
  document_subset=document_subset,
1562
  document_choice=document_choice,
1563
  num_prompt_tokens=num_prompt_tokens,
 
1581
  clear_torch_cache()
1582
  return
1583
 
1584
+ if inference_server.startswith('vllm') or inference_server.startswith('openai') or inference_server.startswith(
1585
+ 'http'):
1586
+ if inference_server.startswith('vllm') or inference_server.startswith('openai'):
1587
  where_from = "openai_client"
1588
+ openai, inf_type = set_openai(inference_server)
1589
 
 
1590
  terminate_response = prompter.terminate_response or []
1591
  stop_sequences = list(set(terminate_response + [prompter.PreResponse]))
1592
  stop_sequences = [x for x in stop_sequences if x]
 
1599
  n=num_return_sequences,
1600
  presence_penalty=1.07 - repetition_penalty + 0.6, # so good default
1601
  )
1602
+ if inf_type == 'vllm' or inference_server == 'openai':
1603
  response = openai.Completion.create(
1604
  model=base_model,
1605
  prompt=prompt,
 
1622
  yield dict(response=prompter.get_response(prompt + text, prompt=prompt,
1623
  sanitize_bot_response=sanitize_bot_response),
1624
  sources='')
1625
+ elif inf_type == 'vllm_chat' or inference_server == 'openai_chat':
1626
+ if inf_type == 'vllm_chat':
1627
+ raise NotImplementedError('%s not supported by vLLM' % inf_type)
1628
  response = openai.ChatCompletion.create(
1629
  model=base_model,
1630
  messages=[
 
1677
  where_from = "gr_client"
1678
  client_langchain_mode = 'Disabled'
1679
  client_langchain_action = LangChainAction.QUERY.value
1680
+ client_langchain_agents = []
1681
  gen_server_kwargs = dict(temperature=temperature,
1682
  top_p=top_p,
1683
  top_k=top_k,
 
1730
  iinput_nochat=gr_iinput, # only for chat=False
1731
  langchain_mode=client_langchain_mode,
1732
  langchain_action=client_langchain_action,
1733
+ langchain_agents=client_langchain_agents,
1734
  top_k_docs=top_k_docs,
1735
  chunk=chunk,
1736
  chunk_size=chunk_size,
 
2312
 
2313
  # move to correct position
2314
  for example in examples:
2315
+ example += [chat, '', '', LangChainMode.DISABLED.value, LangChainAction.QUERY.value, [],
2316
+ top_k_docs, chunk, chunk_size, DocumentChoices.Relevant.name, []
2317
  ]
2318
  # adjust examples if non-chat mode
2319
  if not chat:
 
2419
 
2420
 
2421
  def get_model_max_length(model_state):
2422
+ if not isinstance(model_state['tokenizer'], (str, type(None))):
2423
  return model_state['tokenizer'].model_max_length
2424
  else:
2425
  return 2048
2426
 
2427
 
2428
  def get_max_max_new_tokens(model_state, **kwargs):
2429
+ if not isinstance(model_state['tokenizer'], (str, type(None))):
2430
  max_max_new_tokens = model_state['tokenizer'].model_max_length
2431
  else:
2432
  max_max_new_tokens = None
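
gen.py changes the score_model default from the hard-coded reward model to 'auto', which resolves by hardware: on CPU scoring is disabled, on GPU the OpenAssistant reward model is used, and the SCORE_MODEL / 'None' overrides still apply. A condensed sketch of that resolution as a standalone helper; the real logic sits inline in main() next to the hf_embedding_model and caption-model CPU fallbacks.

import os
import torch

def resolve_score_model(score_model: str = 'auto') -> str:
    score_model = os.getenv('SCORE_MODEL', score_model)
    if str(score_model) == 'None':
        return ''
    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
    if score_model == 'auto':
        # response scoring is too expensive to run on CPU, so only auto-enable it on GPU
        return '' if n_gpus == 0 else 'OpenAssistant/reward-model-deberta-v3-large-v2'
    return score_model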
gpt_langchain.py CHANGED
@@ -21,6 +21,7 @@ import filelock
21
  from joblib import delayed
22
  from langchain.callbacks import streaming_stdout
23
  from langchain.embeddings import HuggingFaceInstructEmbeddings
 
24
  from tqdm import tqdm
25
 
26
  from enums import DocumentChoices, no_lora_str, model_token_mapping, source_prefix, source_postfix, non_query_commands, \
@@ -30,7 +31,7 @@ from gen import get_model, SEED
30
  from prompter import non_hf_types, PromptType, Prompter
31
  from utils import wrapped_partial, EThread, import_matplotlib, sanitize_filename, makedirs, get_url, flatten_list, \
32
  get_device, ProgressParallel, remove, hash_file, clear_torch_cache, NullContext, get_hf_server, FakeTokenizer, \
33
- have_libreoffice, have_arxiv, have_playwright, have_selenium, have_tesseract, have_pymupdf
34
  from utils_langchain import StreamingGradioCallbackHandler
35
 
36
  import_matplotlib()
@@ -276,15 +277,7 @@ from typing import Any, Dict, List, Optional, Set
276
 
277
  from pydantic import Extra, Field, root_validator
278
 
279
- from langchain.callbacks.manager import CallbackManagerForLLMRun
280
-
281
- """Wrapper around Huggingface text generation inference API."""
282
- from functools import partial
283
- from typing import Any, Dict, List, Optional
284
-
285
- from pydantic import Extra, Field, root_validator
286
-
287
- from langchain.callbacks.manager import CallbackManagerForLLMRun
288
  from langchain.llms.base import LLM
289
 
290
 
@@ -356,6 +349,7 @@ class GradioInference(LLM):
356
  gr_client = self.client
357
  client_langchain_mode = 'Disabled'
358
  client_langchain_action = LangChainAction.QUERY.value
 
359
  top_k_docs = 1
360
  chunk = True
361
  chunk_size = 512
@@ -385,6 +379,7 @@ class GradioInference(LLM):
385
  iinput_nochat='', # only for chat=False
386
  langchain_mode=client_langchain_mode,
387
  langchain_action=client_langchain_action,
 
388
  top_k_docs=top_k_docs,
389
  chunk=chunk,
390
  chunk_size=chunk_size,
@@ -566,6 +561,92 @@ class H2OHuggingFaceTextGenInference(HuggingFaceTextGenInference):
566
 
567
 
568
  from langchain.chat_models import ChatOpenAI
569
 
570
 
571
  class H2OChatOpenAI(ChatOpenAI):
@@ -599,14 +680,26 @@ def get_llm(use_openai_model=False,
599
  sanitize_bot_response=False,
600
  verbose=False,
601
  ):
602
- if use_openai_model or inference_server in ['openai', 'openai_chat']:
603
  if use_openai_model and model_name is None:
604
  model_name = "gpt-3.5-turbo"
605
- if inference_server == 'openai':
606
- from langchain.llms import OpenAI
607
- cls = OpenAI
608
- else:
609
  cls = H2OChatOpenAI
 
 
 
 
 
 
 
 
 
 
 
 
610
  callbacks = [StreamingGradioCallbackHandler()]
611
  llm = cls(model_name=model_name,
612
  temperature=temperature if do_sample else 0,
@@ -616,11 +709,18 @@ def get_llm(use_openai_model=False,
616
  frequency_penalty=0,
617
  presence_penalty=1.07 - repetition_penalty + 0.6, # so good default
618
  callbacks=callbacks if stream_output else None,
 
 
 
 
 
 
619
  )
620
  streamer = callbacks[0] if stream_output else None
621
  if inference_server in ['openai', 'openai_chat']:
622
  prompt_type = inference_server
623
  else:
 
624
  prompt_type = prompt_type or 'plain'
625
  elif inference_server:
626
  assert inference_server.startswith(
@@ -916,7 +1016,6 @@ def get_dai_docs(from_hf=False, get_pickle=True):
916
  return sources
917
 
918
 
919
-
920
  image_types = ["png", "jpg", "jpeg"]
921
  non_image_types = ["pdf", "txt", "csv", "toml", "py", "rst", "rtf",
922
  "md",
@@ -927,7 +1026,8 @@ non_image_types = ["pdf", "txt", "csv", "toml", "py", "rst", "rtf",
927
  ]
928
  # "msg", GPL3
929
 
930
- if have_libreoffice:
 
931
  non_image_types.extend(["docx", "doc", "xls", "xlsx"])
932
 
933
  file_types = non_image_types + image_types
@@ -936,9 +1036,11 @@ file_types = non_image_types + image_types
936
  def add_meta(docs1, file):
937
  file_extension = pathlib.Path(file).suffix
938
  hashid = hash_file(file)
 
939
  if not isinstance(docs1, (list, tuple, types.GeneratorType)):
940
  docs1 = [docs1]
941
- [x.metadata.update(dict(input_type=file_extension, date=str(datetime.now()), hashid=hashid)) for x in docs1]
 
942
 
943
 
944
  def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False,
@@ -1011,11 +1113,11 @@ def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False,
1011
  add_meta(docs1, file)
1012
  docs1 = clean_doc(docs1)
1013
  doc1 = chunk_sources(docs1, chunk=chunk, chunk_size=chunk_size, language=Language.HTML)
1014
- elif (file.lower().endswith('.docx') or file.lower().endswith('.doc')) and have_libreoffice:
1015
  docs1 = UnstructuredWordDocumentLoader(file_path=file).load()
1016
  add_meta(docs1, file)
1017
  doc1 = chunk_sources(docs1, chunk=chunk, chunk_size=chunk_size)
1018
- elif (file.lower().endswith('.xlsx') or file.lower().endswith('.xls')) and have_libreoffice:
1019
  docs1 = UnstructuredExcelLoader(file_path=file).load()
1020
  add_meta(docs1, file)
1021
  doc1 = chunk_sources(docs1, chunk=chunk, chunk_size=chunk_size)
@@ -1760,6 +1862,7 @@ def _run_qa_db(query=None,
1760
  cut_distanct=1.1,
1761
  sanitize_bot_response=False,
1762
  show_rank=False,
 
1763
  load_db_if_exists=False,
1764
  db=None,
1765
  do_sample=False,
@@ -1775,6 +1878,7 @@ def _run_qa_db(query=None,
1775
  num_return_sequences=1,
1776
  langchain_mode=None,
1777
  langchain_action=None,
 
1778
  document_subset=DocumentChoices.Relevant.name,
1779
  document_choice=[],
1780
  n_jobs=-1,
@@ -1857,20 +1961,21 @@ def _run_qa_db(query=None,
1857
  formatted_doc_chunks = '\n\n'.join([get_url(x) + '\n\n' + x.page_content for x in docs])
1858
  yield formatted_doc_chunks, ''
1859
  return
1860
- if not docs and langchain_action in [LangChainAction.SUMMARIZE_MAP.value,
1861
- LangChainAction.SUMMARIZE_ALL.value,
1862
- LangChainAction.SUMMARIZE_REFINE.value]:
1863
- ret = 'No relevant documents to summarize.' if have_any_docs else 'No documents to summarize.'
1864
- extra = ''
1865
- yield ret, extra
1866
- return
1867
- if not docs and langchain_mode not in [LangChainMode.DISABLED.value,
1868
- LangChainMode.CHAT_LLM.value,
1869
- LangChainMode.LLM.value]:
1870
- ret = 'No relevant documents to query.' if have_any_docs else 'No documents to query.'
1871
- extra = ''
1872
- yield ret, extra
1873
- return
 
1874
 
1875
  if chain is None and model_name not in non_hf_types:
1876
  # here if no docs at all and not HF type
@@ -1948,6 +2053,7 @@ def get_chain(query=None,
1948
  db=None,
1949
  langchain_mode=None,
1950
  langchain_action=None,
 
1951
  document_subset=DocumentChoices.Relevant.name,
1952
  document_choice=[],
1953
  n_jobs=-1,
@@ -1961,6 +2067,7 @@ def get_chain(query=None,
1961
  auto_reduce_chunks=True,
1962
  max_chunks=100,
1963
  ):
 
1964
  # determine whether use of context out of docs is planned
1965
  if not use_openai_model and prompt_type not in ['plain'] or model_name in non_hf_types:
1966
  if langchain_mode in ['Disabled', 'ChatLLM', 'LLM']:
@@ -2092,8 +2199,8 @@ def get_chain(query=None,
2092
  for result in zip(db_documents, db_metadatas)]
2093
 
2094
  # order documents
2095
- doc_hashes = [x['doc_hash'] for x in db_metadatas]
2096
- doc_chunk_ids = [x['chunk_id'] for x in db_metadatas]
2097
  docs_with_score = [x for _, _, x in
2098
  sorted(zip(doc_hashes, doc_chunk_ids, docs_with_score), key=lambda x: (x[0], x[1]))
2099
  ]
@@ -2302,6 +2409,7 @@ def clean_doc(docs1):
2302
 
2303
  def chunk_sources(sources, chunk=True, chunk_size=512, language=None):
2304
  if not chunk:
 
2305
  return sources
2306
  if not isinstance(sources, (list, tuple, types.GeneratorType)) and not callable(sources):
2307
  # if just one document
@@ -2320,8 +2428,7 @@ def chunk_sources(sources, chunk=True, chunk_size=512, language=None):
2320
  source_chunks = splitter.split_documents(sources)
2321
 
2322
  # currently in order, but when pull from db won't be, so mark order and document by hash
2323
- doc_hash = str(uuid.uuid4())[:10]
2324
- [x.metadata.update(dict(doc_hash=doc_hash, chunk_id=chunk_id)) for chunk_id, x in enumerate(source_chunks)]
2325
 
2326
  return source_chunks
2327
 
 
21
  from joblib import delayed
22
  from langchain.callbacks import streaming_stdout
23
  from langchain.embeddings import HuggingFaceInstructEmbeddings
24
+ from langchain.schema import LLMResult
25
  from tqdm import tqdm
26
 
27
  from enums import DocumentChoices, no_lora_str, model_token_mapping, source_prefix, source_postfix, non_query_commands, \
 
31
  from prompter import non_hf_types, PromptType, Prompter
32
  from utils import wrapped_partial, EThread, import_matplotlib, sanitize_filename, makedirs, get_url, flatten_list, \
33
  get_device, ProgressParallel, remove, hash_file, clear_torch_cache, NullContext, get_hf_server, FakeTokenizer, \
34
+ have_libreoffice, have_arxiv, have_playwright, have_selenium, have_tesseract, have_pymupdf, set_openai
35
  from utils_langchain import StreamingGradioCallbackHandler
36
 
37
  import_matplotlib()
 
277
 
278
  from pydantic import Extra, Field, root_validator
279
 
280
+ from langchain.callbacks.manager import CallbackManagerForLLMRun, Callbacks
 
 
 
 
 
 
 
 
281
  from langchain.llms.base import LLM
282
 
283
 
 
349
  gr_client = self.client
350
  client_langchain_mode = 'Disabled'
351
  client_langchain_action = LangChainAction.QUERY.value
352
+ client_langchain_agents = []
353
  top_k_docs = 1
354
  chunk = True
355
  chunk_size = 512
 
379
  iinput_nochat='', # only for chat=False
380
  langchain_mode=client_langchain_mode,
381
  langchain_action=client_langchain_action,
382
+ langchain_agents=client_langchain_agents,
383
  top_k_docs=top_k_docs,
384
  chunk=chunk,
385
  chunk_size=chunk_size,
 
561
 
562
 
563
  from langchain.chat_models import ChatOpenAI
564
+ from langchain.llms import OpenAI
565
+ from langchain.llms.openai import _streaming_response_template, completion_with_retry, _update_response, \
566
+ update_token_usage
567
+
568
+
569
+ class H2OOpenAI(OpenAI):
570
+ """
571
+ New class to handle vLLM's use of OpenAI, no vllm_chat supported, so only need here
572
+ Handles prompting that OpenAI doesn't need, stopping as well
573
+ """
574
+ stop_sequences: Any = None
575
+ sanitize_bot_response: bool = False
576
+ prompter: Any = None
577
+ tokenizer: Any = None
578
+
579
+ @classmethod
580
+ def all_required_field_names(cls) -> Set:
581
+ all_required_field_names = super(OpenAI, cls).all_required_field_names()
582
+ all_required_field_names.update(
583
+ {'top_p', 'frequency_penalty', 'presence_penalty', 'stop_sequences', 'sanitize_bot_response', 'prompter',
584
+ 'tokenizer'})
585
+ return all_required_field_names
586
+
587
+ def _generate(
588
+ self,
589
+ prompts: List[str],
590
+ stop: Optional[List[str]] = None,
591
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
592
+ **kwargs: Any,
593
+ ) -> LLMResult:
594
+ stop = self.stop_sequences if not stop else self.stop_sequences + stop
595
+
596
+ # HF inference server needs control over input tokens
597
+ assert self.tokenizer is not None
598
+ from h2oai_pipeline import H2OTextGenerationPipeline
599
+ for prompti, prompt in enumerate(prompts):
600
+ prompt, num_prompt_tokens = H2OTextGenerationPipeline.limit_prompt(prompt, self.tokenizer)
601
+ # NOTE: OpenAI/vLLM server does not add prompting, so must do here
602
+ data_point = dict(context='', instruction=prompt, input='')
603
+ prompt = self.prompter.generate_prompt(data_point)
604
+ prompts[prompti] = prompt
605
+
606
+ params = self._invocation_params
607
+ params = {**params, **kwargs}
608
+ sub_prompts = self.get_sub_prompts(params, prompts, stop)
609
+ choices = []
610
+ token_usage: Dict[str, int] = {}
611
+ # Get the token usage from the response.
612
+ # Includes prompt, completion, and total tokens used.
613
+ _keys = {"completion_tokens", "prompt_tokens", "total_tokens"}
614
+ text = ''
615
+ for _prompts in sub_prompts:
616
+ if self.streaming:
617
+ text_with_prompt = ""
618
+ prompt = _prompts[0]
619
+ if len(_prompts) > 1:
620
+ raise ValueError("Cannot stream results with multiple prompts.")
621
+ params["stream"] = True
622
+ response = _streaming_response_template()
623
+ first = True
624
+ for stream_resp in completion_with_retry(
625
+ self, prompt=_prompts, **params
626
+ ):
627
+ if first:
628
+ stream_resp["choices"][0]["text"] = prompt + stream_resp["choices"][0]["text"]
629
+ first = False
630
+ text_chunk = stream_resp["choices"][0]["text"]
631
+ text_with_prompt += text_chunk
632
+ text = self.prompter.get_response(text_with_prompt, prompt=prompt,
633
+ sanitize_bot_response=self.sanitize_bot_response)
634
+ if run_manager:
635
+ run_manager.on_llm_new_token(
636
+ text_chunk,
637
+ verbose=self.verbose,
638
+ logprobs=stream_resp["choices"][0]["logprobs"],
639
+ )
640
+ _update_response(response, stream_resp)
641
+ choices.extend(response["choices"])
642
+ else:
643
+ response = completion_with_retry(self, prompt=_prompts, **params)
644
+ choices.extend(response["choices"])
645
+ if not self.streaming:
646
+ # Can't update token usage if streaming
647
+ update_token_usage(_keys, response, token_usage)
648
+ choices[0]['text'] = text
649
+ return self.create_llm_result(choices, prompts, token_usage)
650
 
651
 
652
  class H2OChatOpenAI(ChatOpenAI):
 
680
  sanitize_bot_response=False,
681
  verbose=False,
682
  ):
683
+ if use_openai_model or inference_server.startswith('openai') or inference_server.startswith('vllm'):
684
  if use_openai_model and model_name is None:
685
  model_name = "gpt-3.5-turbo"
686
+ openai, inf_type = set_openai(
687
+ inference_server) # FIXME: Will later import be ignored? I think so, so should be fine
688
+ kwargs_extra = {}
689
+ if inference_server == 'openai_chat' or inf_type == 'vllm_chat':
690
  cls = H2OChatOpenAI
691
+ else:
692
+ cls = H2OOpenAI
693
+ if inf_type == 'vllm':
694
+ terminate_response = prompter.terminate_response or []
695
+ stop_sequences = list(set(terminate_response + [prompter.PreResponse]))
696
+ stop_sequences = [x for x in stop_sequences if x]
697
+ kwargs_extra = dict(stop_sequences=stop_sequences,
698
+ sanitize_bot_response=sanitize_bot_response,
699
+ prompter=prompter,
700
+ tokenizer=tokenizer,
701
+ client=None)
702
+
703
  callbacks = [StreamingGradioCallbackHandler()]
704
  llm = cls(model_name=model_name,
705
  temperature=temperature if do_sample else 0,
 
709
  frequency_penalty=0,
710
  presence_penalty=1.07 - repetition_penalty + 0.6, # so good default
711
  callbacks=callbacks if stream_output else None,
712
+ openai_api_key=openai.api_key,
713
+ openai_api_base=openai.api_base,
714
+ logit_bias=None if inf_type =='vllm' else {},
715
+ max_retries=2,
716
+ streaming=stream_output,
717
+ **kwargs_extra
718
  )
719
  streamer = callbacks[0] if stream_output else None
720
  if inference_server in ['openai', 'openai_chat']:
721
  prompt_type = inference_server
722
  else:
723
+ # vllm goes here
724
  prompt_type = prompt_type or 'plain'
725
  elif inference_server:
726
  assert inference_server.startswith(
 
1016
  return sources
1017
 
1018
 
 
1019
  image_types = ["png", "jpg", "jpeg"]
1020
  non_image_types = ["pdf", "txt", "csv", "toml", "py", "rst", "rtf",
1021
  "md",
 
1026
  ]
1027
  # "msg", GPL3
1028
 
1029
+ if have_libreoffice or True:
1030
+ # or True so it tries to load, e.g. on MAC/Windows, even if don't have libreoffice since works without that
1031
  non_image_types.extend(["docx", "doc", "xls", "xlsx"])
1032
 
1033
  file_types = non_image_types + image_types
 
1036
  def add_meta(docs1, file):
1037
  file_extension = pathlib.Path(file).suffix
1038
  hashid = hash_file(file)
1039
+ doc_hash = str(uuid.uuid4())[:10]
1040
  if not isinstance(docs1, (list, tuple, types.GeneratorType)):
1041
  docs1 = [docs1]
1042
+ [x.metadata.update(dict(input_type=file_extension, date=str(datetime.now()), hashid=hashid, doc_hash=doc_hash)) for
1043
+ x in docs1]
1044
 
1045
 
1046
  def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False,
 
1113
  add_meta(docs1, file)
1114
  docs1 = clean_doc(docs1)
1115
  doc1 = chunk_sources(docs1, chunk=chunk, chunk_size=chunk_size, language=Language.HTML)
1116
+ elif (file.lower().endswith('.docx') or file.lower().endswith('.doc')) and (have_libreoffice or True):
1117
  docs1 = UnstructuredWordDocumentLoader(file_path=file).load()
1118
  add_meta(docs1, file)
1119
  doc1 = chunk_sources(docs1, chunk=chunk, chunk_size=chunk_size)
1120
+ elif (file.lower().endswith('.xlsx') or file.lower().endswith('.xls')) and (have_libreoffice or True):
1121
  docs1 = UnstructuredExcelLoader(file_path=file).load()
1122
  add_meta(docs1, file)
1123
  doc1 = chunk_sources(docs1, chunk=chunk, chunk_size=chunk_size)
 
1862
  cut_distanct=1.1,
1863
  sanitize_bot_response=False,
1864
  show_rank=False,
1865
+ use_llm_if_no_docs=False,
1866
  load_db_if_exists=False,
1867
  db=None,
1868
  do_sample=False,
 
1878
  num_return_sequences=1,
1879
  langchain_mode=None,
1880
  langchain_action=None,
1881
+ langchain_agents=None,
1882
  document_subset=DocumentChoices.Relevant.name,
1883
  document_choice=[],
1884
  n_jobs=-1,
 
1961
  formatted_doc_chunks = '\n\n'.join([get_url(x) + '\n\n' + x.page_content for x in docs])
1962
  yield formatted_doc_chunks, ''
1963
  return
1964
+ if not use_llm_if_no_docs:
1965
+ if not docs and langchain_action in [LangChainAction.SUMMARIZE_MAP.value,
1966
+ LangChainAction.SUMMARIZE_ALL.value,
1967
+ LangChainAction.SUMMARIZE_REFINE.value]:
1968
+ ret = 'No relevant documents to summarize.' if have_any_docs else 'No documents to summarize.'
1969
+ extra = ''
1970
+ yield ret, extra
1971
+ return
1972
+ if not docs and langchain_mode not in [LangChainMode.DISABLED.value,
1973
+ LangChainMode.CHAT_LLM.value,
1974
+ LangChainMode.LLM.value]:
1975
+ ret = 'No relevant documents to query.' if have_any_docs else 'No documents to query.'
1976
+ extra = ''
1977
+ yield ret, extra
1978
+ return
1979
 
1980
  if chain is None and model_name not in non_hf_types:
1981
  # here if no docs at all and not HF type
 
2053
  db=None,
2054
  langchain_mode=None,
2055
  langchain_action=None,
2056
+ langchain_agents=None,
2057
  document_subset=DocumentChoices.Relevant.name,
2058
  document_choice=[],
2059
  n_jobs=-1,
 
2067
  auto_reduce_chunks=True,
2068
  max_chunks=100,
2069
  ):
2070
+ assert langchain_agents is not None # should be at least []
2071
  # determine whether use of context out of docs is planned
2072
  if not use_openai_model and prompt_type not in ['plain'] or model_name in non_hf_types:
2073
  if langchain_mode in ['Disabled', 'ChatLLM', 'LLM']:
 
2199
  for result in zip(db_documents, db_metadatas)]
2200
 
2201
  # order documents
2202
+ doc_hashes = [x.get('doc_hash', 'None') for x in db_metadatas]
2203
+ doc_chunk_ids = [x.get('chunk_id', 0) for x in db_metadatas]
2204
  docs_with_score = [x for _, _, x in
2205
  sorted(zip(doc_hashes, doc_chunk_ids, docs_with_score), key=lambda x: (x[0], x[1]))
2206
  ]
 
2409
 
2410
  def chunk_sources(sources, chunk=True, chunk_size=512, language=None):
2411
  if not chunk:
2412
+ [x.metadata.update(dict(chunk_id=chunk_id)) for chunk_id, x in enumerate(sources)]
2413
  return sources
2414
  if not isinstance(sources, (list, tuple, types.GeneratorType)) and not callable(sources):
2415
  # if just one document
 
2428
  source_chunks = splitter.split_documents(sources)
2429
 
2430
  # currently in order, but when pull from db won't be, so mark order and document by hash
2431
+ [x.metadata.update(dict(chunk_id=chunk_id)) for chunk_id, x in enumerate(source_chunks)]
 
2432
 
2433
  return source_chunks
2434
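
gpt_langchain.py moves doc_hash into add_meta() (one hash per ingested file) and assigns chunk_id in chunk_sources() even when chunking is disabled, so get_chain() can restore document order for chunks pulled back out of the vector db. A sketch of that reordering, including the .get() defaults that keep databases built before this commit working:

def order_docs_with_score(docs_with_score, db_metadatas):
    # sort retrieved chunks by (per-file doc_hash, per-chunk chunk_id)
    doc_hashes = [m.get('doc_hash', 'None') for m in db_metadatas]
    doc_chunk_ids = [m.get('chunk_id', 0) for m in db_metadatas]
    return [d for _, _, d in sorted(zip(doc_hashes, doc_chunk_ids, docs_with_score),
                                    key=lambda t: (t[0], t[1]))]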
 
gradio_runner.py CHANGED
@@ -58,7 +58,7 @@ from prompter import prompt_type_to_model_name, prompt_types_strings, inv_prompt
58
  from utils import get_githash, flatten_list, zip_data, s3up, clear_torch_cache, get_torch_allocated, system_info_print, \
59
  ping, get_short_name, makedirs, get_kwargs, remove, system_info, ping_gpu, get_url, get_local_ip
60
  from gen import get_model, languages_covered, evaluate, score_qa, langchain_modes, inputs_kwargs_list, scratch_base_dir, \
61
- get_max_max_new_tokens, get_minmax_top_k_docs, history_to_context, langchain_actions
61
+ get_max_max_new_tokens, get_minmax_top_k_docs, history_to_context, langchain_actions, langchain_agents_list
62
  from evaluate_params import eval_func_param_names, no_default_param_names, eval_func_param_names_defaults
63

64
  from apscheduler.schedulers.background import BackgroundScheduler
@@ -101,6 +101,7 @@ def go_gradio(**kwargs):
101
  db_type = kwargs['db_type']
102
  visible_langchain_modes = kwargs['visible_langchain_modes']
103
  visible_langchain_actions = kwargs['visible_langchain_actions']
104
+ visible_langchain_agents = kwargs['visible_langchain_agents']
105
  allow_upload_to_user_data = kwargs['allow_upload_to_user_data']
106
  allow_upload_to_my_data = kwargs['allow_upload_to_my_data']
107
  enable_sources_list = kwargs['enable_sources_list']
@@ -361,6 +362,14 @@ def go_gradio(**kwargs):
362
  value=allowed_actions[0] if len(allowed_actions) > 0 else None,
363
  label="Action",
364
  visible=True)
365
+ allowed_agents = [x for x in langchain_agents_list if x in visible_langchain_agents]
366
+ langchain_agents = gr.Dropdown(
367
+ langchain_agents_list,
368
+ value=kwargs['langchain_agents'],
369
+ label="Agents",
370
+ multiselect=True,
371
+ interactive=True,
372
+ visible=False)  # WIP
373
  col_tabs = gr.Column(elem_id="col_container", scale=10)
374
  with (col_tabs, gr.Tabs()):
375
  with gr.TabItem("Chat"):
@@ -469,6 +478,7 @@ def go_gradio(**kwargs):
478
  value=None,
479
  interactive=True,
480
  multiselect=False,
481
+ visible=True,
482
  )
483
  with gr.Column(scale=4):
484
  pass
@@ -1035,6 +1045,8 @@ def go_gradio(**kwargs):
1045
  user_kwargs['langchain_mode'] = 'Disabled'
1046
  if 'langchain_action' not in user_kwargs:
1047
  user_kwargs['langchain_action'] = LangChainAction.QUERY.value
1048
+ if 'langchain_agents' not in user_kwargs:
1049
+ user_kwargs['langchain_agents'] = []
1050

1051
  set1 = set(list(default_kwargs1.keys()))
1052
  set2 = set(eval_func_param_names)
@@ -1216,6 +1228,7 @@
1228
  prompt_type1 = args_list[eval_func_param_names.index('prompt_type')]
1229
  langchain_mode1 = args_list[eval_func_param_names.index('langchain_mode')]
1230
  langchain_action1 = args_list[eval_func_param_names.index('langchain_action')]
1231
+ langchain_agents1 = args_list[eval_func_param_names.index('langchain_agents')]
1232
  document_subset1 = args_list[eval_func_param_names.index('document_subset')]
1233
  document_choice1 = args_list[eval_func_param_names.index('document_choice')]
1234
  if not prompt_type1:
@@ -1312,6 +1325,7 @@
1325
  args_list = args_list[:-3]  # only keep rest needed for evaluate()
1326
  langchain_mode1 = args_list[eval_func_param_names.index('langchain_mode')]
1327
  langchain_action1 = args_list[eval_func_param_names.index('langchain_action')]
1328
+ langchain_agents1 = args_list[eval_func_param_names.index('langchain_agents')]
1329
  document_subset1 = args_list[eval_func_param_names.index('document_subset')]
1330
  document_choice1 = args_list[eval_func_param_names.index('document_choice')]
1331
  if not history:
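Note: the user_kwargs defaulting added around line 1048 is what keeps older API clients working when they do not send the new field. The sketch below is hedged: the payload is invented for illustration and the action string is a stand-in for LangChainAction.QUERY.value, not a real Gradio request:

```python
# Sketch of backward-compatible defaulting for new evaluate() parameters.
# The payload below is invented; 'Query' stands in for LangChainAction.QUERY.value.

def fill_new_defaults(user_kwargs):
    if 'langchain_mode' not in user_kwargs:
        user_kwargs['langchain_mode'] = 'Disabled'
    if 'langchain_action' not in user_kwargs:
        user_kwargs['langchain_action'] = 'Query'
    if 'langchain_agents' not in user_kwargs:
        # new parameter: old clients never send it, so default to "no agents"
        user_kwargs['langchain_agents'] = []
    return user_kwargs


old_client_payload = dict(instruction_nochat='Who are you?', prompt_type='human_bot')
print(fill_new_defaults(old_client_payload)['langchain_agents'])  # -> []
```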
 
prompter.py CHANGED
@@ -582,6 +582,20 @@ ASSISTANT:
582
  # if add space here, non-unique tokenization will often make LLM produce wrong output
583
  PreResponse = PreResponse
584
  # generates_leading_space = True
585
+ elif prompt_type in [PromptType.guanaco.value, str(PromptType.guanaco.value),
586
+ PromptType.guanaco.name]:
587
+ # https://huggingface.co/TheBloke/guanaco-65B-GPTQ
588
+ promptA = promptB = "" if not (chat and reduced) else ''
589
+
590
+ PreInstruct = """### Human: """
591
+
592
+ PreInput = None
593
+
594
+ PreResponse = """### Assistant:"""
595
+ terminate_response = ['### Human:']  # but only allow terminate after prompt is found correctly, else can't terminate
596
+ chat_turn_sep = chat_sep = '\n'
597
+ humanstr = PreInstruct
598
+ botstr = PreResponse
599
  else:
600
  raise RuntimeError("No such prompt_type=%s" % prompt_type)
601

@@ -810,9 +824,20 @@ class Prompter(object):
824
  if oi > 0:
825
  # post fix outputs with seperator
826
  output += '\n'
827
+ output = self.fix_text(self.prompt_type, output)
828
  outputs[oi] = output
829
  # join all outputs, only one extra new line between outputs
830
  output = '\n'.join(outputs)
831
  if self.debug:
832
  print("outputclean:\n%s" % '\n\n'.join(outputs), flush=True)
833
  return output
834
+
835
+ @staticmethod
836
+ def fix_text(prompt_type1, text1):
837
+ if prompt_type1 == 'human_bot':
838
+ # hack bug in vLLM with stopping, stops right, but doesn't return last token
839
+ hfix = '<human'
840
+ if text1.endswith(hfix):
841
+ text1 = text1[:-len(hfix)]
842
+ return text1
843
+
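Note: both additions are easy to sanity-check outside the Prompter class. The sketch below is built directly from the strings introduced above; it mimics, rather than calls, the real prompter.py, and the sample history and outputs are invented:

```python
# Sketch: a guanaco-style prompt assembled from the pieces defined above,
# plus the '<human' suffix cleanup; this mimics, not calls, the real Prompter.
PreInstruct = "### Human: "
PreResponse = "### Assistant:"
chat_turn_sep = '\n'


def render(history, instruction):
    # history: list of (human, bot) turns already completed
    turns = [f"{PreInstruct}{h}{chat_turn_sep}{PreResponse} {b}" for h, b in history]
    turns.append(f"{PreInstruct}{instruction}{chat_turn_sep}{PreResponse}")
    return chat_turn_sep.join(turns)


def fix_text(prompt_type1, text1):
    if prompt_type1 == 'human_bot':
        # mirror of the vLLM workaround: drop a dangling, partially emitted stop token
        hfix = '<human'
        if text1.endswith(hfix):
            text1 = text1[:-len(hfix)]
    return text1


print(render([("Hi", "Hello!")], "Who are you?"))
print(fix_text('human_bot', 'I am a helpful assistant.<human'))
```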
utils.py CHANGED
@@ -950,7 +950,6 @@ try:
950
  except (pkg_resources.DistributionNotFound, AssertionError):
951
  have_langchain = False
952

953
-
954
  import distutils.spawn
955

956
  have_tesseract = distutils.spawn.find_executable("tesseract")
@@ -985,3 +984,20 @@
984

985
  # disable, hangs too often
986
  have_playwright = False
987
+
988
+
989
+ def set_openai(inference_server):
990
+ if inference_server.startswith('vllm'):
991
+ import openai_vllm
992
+ openai_vllm.api_key = "EMPTY"
993
+ inf_type = inference_server.split(':')[0]
994
+ ip_vllm = inference_server.split(':')[1]
995
+ port_vllm = inference_server.split(':')[2]
996
+ openai_vllm.api_base = f"http://{ip_vllm}:{port_vllm}/v1"
997
+ return openai_vllm, inf_type
998
+ else:
999
+ import openai
1000
+ openai.api_key = os.getenv("OPENAI_API_KEY")
1001
+ openai.api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
1002
+ inf_type = inference_server
1003
+ return openai, inf_type
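Note: a usage sketch for the new helper. The inference_server strings below are invented examples, openai_vllm is assumed to be the project's vendored copy of the openai client, and the import assumes the module is available as utils, so treat this as illustrative only:

```python
# Hedged usage sketch for set_openai(); server addresses are made up.
# 'vllm:192.168.1.10:5000' -> talk to a vLLM server via its OpenAI-compatible API.
# any non-vllm value      -> regular OpenAI endpoint configured from env vars.
from utils import set_openai  # assumes this module is importable as utils

client, inf_type = set_openai('vllm:192.168.1.10:5000')
print(inf_type)         # 'vllm'
print(client.api_base)  # 'http://192.168.1.10:5000/v1'

client, inf_type = set_openai('openai_chat')
print(inf_type)         # 'openai_chat'; key and base URL come from the environment
```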