pseudotensor committed
Commit 454e203
1 parent: eac73aa
Update with h2oGPT hash ad9d685b188cece0b9c69716ea8e320b74f0caf7
Browse files
- client_test.py +26 -10
- enums.py +24 -6
- evaluate_params.py +5 -0
- gen.py +186 -58
- gpt4all_llm.py +18 -8
- gpt_langchain.py +314 -145
- gradio_runner.py +470 -178
- gradio_utils/__init__.py +0 -0
- gradio_utils/__pycache__/__init__.cpython-310.pyc +0 -0
- gradio_utils/__pycache__/css.cpython-310.pyc +0 -0
- gradio_utils/css.py +4 -0
- h2oai_pipeline.py +4 -1
- iterators/__pycache__/timeout_iterator.cpython-310.pyc +0 -0
- iterators/timeout_iterator.py +1 -1
- prompter.py +53 -0
- requirements.txt +16 -16
- utils.py +100 -7
client_test.py
CHANGED
@@ -48,7 +48,7 @@ import markdown  # pip install markdown
 import pytest
 from bs4 import BeautifulSoup  # pip install beautifulsoup4

-from enums import
+from enums import DocumentSubset, LangChainAction

 debug = False

@@ -68,7 +68,9 @@ def get_args(prompt, prompt_type, chat=False, stream_output=False,
 max_new_tokens=50,
 top_k_docs=3,
 langchain_mode='Disabled',
+add_chat_history_to_context=True,
 langchain_action=LangChainAction.QUERY.value,
+langchain_agents=[],
 prompt_dict=None):
 from collections import OrderedDict
 kwargs = OrderedDict(instruction=prompt if chat else '',  # only for chat=True
@@ -94,11 +96,13 @@ def get_args(prompt, prompt_type, chat=False, stream_output=False,
 instruction_nochat=prompt if not chat else '',
 iinput_nochat='',  # only for chat=False
 langchain_mode=langchain_mode,
+add_chat_history_to_context=add_chat_history_to_context,
 langchain_action=langchain_action,
+langchain_agents=langchain_agents,
 top_k_docs=top_k_docs,
 chunk=True,
 chunk_size=512,
-document_subset=
+document_subset=DocumentSubset.Relevant.name,
 document_choice=[],
 )
 from evaluate_params import eval_func_param_names
@@ -202,9 +206,11 @@ def run_client_nochat_api_lean_morestuff(prompt, prompt_type='human_bot', max_ne
 instruction_nochat=prompt,
 iinput_nochat='',
 langchain_mode='Disabled',
+add_chat_history_to_context=True,
 langchain_action=LangChainAction.QUERY.value,
+langchain_agents=[],
 top_k_docs=4,
-document_subset=
+document_subset=DocumentSubset.Relevant.name,
 document_choice=[],
 )

@@ -225,23 +231,30 @@ def run_client_nochat_api_lean_morestuff(prompt, prompt_type='human_bot', max_ne
 @pytest.mark.skip(reason="For manual use against some server, no server launched")
 def test_client_chat(prompt_type='human_bot'):
 return run_client_chat(prompt='Who are you?', prompt_type=prompt_type, stream_output=False, max_new_tokens=50,
-langchain_mode='Disabled',
+langchain_mode='Disabled',
+langchain_action=LangChainAction.QUERY.value,
+langchain_agents=[])


 @pytest.mark.skip(reason="For manual use against some server, no server launched")
 def test_client_chat_stream(prompt_type='human_bot'):
 return run_client_chat(prompt="Tell a very long kid's story about birds.", prompt_type=prompt_type,
 stream_output=True, max_new_tokens=512,
-langchain_mode='Disabled',
+langchain_mode='Disabled',
+langchain_action=LangChainAction.QUERY.value,
+langchain_agents=[])


-def run_client_chat(prompt, prompt_type, stream_output, max_new_tokens,
+def run_client_chat(prompt, prompt_type, stream_output, max_new_tokens,
+langchain_mode, langchain_action, langchain_agents,
 prompt_dict=None):
 client = get_client(serialize=False)

 kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
-max_new_tokens=max_new_tokens,
+max_new_tokens=max_new_tokens,
+langchain_mode=langchain_mode,
 langchain_action=langchain_action,
+langchain_agents=langchain_agents,
 prompt_dict=prompt_dict)
 return run_client(client, prompt, args, kwargs)

@@ -285,15 +298,18 @@ def run_client(client, prompt, args, kwargs, do_md_to_text=True, verbose=False):
 def test_client_nochat_stream(prompt_type='human_bot'):
 return run_client_nochat_gen(prompt="Tell a very long kid's story about birds.", prompt_type=prompt_type,
 stream_output=True, max_new_tokens=512,
-langchain_mode='Disabled',
+langchain_mode='Disabled',
+langchain_action=LangChainAction.QUERY.value,
+langchain_agents=[])


-def run_client_nochat_gen(prompt, prompt_type, stream_output, max_new_tokens,
+def run_client_nochat_gen(prompt, prompt_type, stream_output, max_new_tokens,
+langchain_mode, langchain_action, langchain_agents):
 client = get_client(serialize=False)

 kwargs, args = get_args(prompt, prompt_type, chat=False, stream_output=stream_output,
 max_new_tokens=max_new_tokens, langchain_mode=langchain_mode,
-langchain_action=langchain_action)
+langchain_action=langchain_action, langchain_agents=langchain_agents)
 return run_client_gen(client, prompt, args, kwargs)
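For orientation, a minimal sketch of what a caller looks like against the updated client API, mirroring the new client_test.py defaults; the server URL and prompt are placeholders, and the response parsing assumes the usual str(dict) payload of /submit_nochat_api shown later in gen.py:

# Illustrative client call (not part of the commit); URL and prompt are placeholders.
import ast
from gradio_client import Client
from enums import DocumentSubset, LangChainAction

client = Client("http://localhost:7860")
kwargs = dict(instruction_nochat="Who are you?",
              iinput_nochat='',
              langchain_mode='Disabled',
              add_chat_history_to_context=True,              # new in this commit
              langchain_action=LangChainAction.QUERY.value,
              langchain_agents=[],                           # new in this commit
              top_k_docs=4,
              document_subset=DocumentSubset.Relevant.name,
              document_choice=[])
# /submit_nochat_api takes the whole kwargs dict serialized as a string
res = client.predict(str(dict(kwargs)), api_name='/submit_nochat_api')
print(ast.literal_eval(res)['response'])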
enums.py
CHANGED
@@ -31,25 +31,30 @@ class PromptType(Enum):
 mptinstruct = 25
 mptchat = 26
 falcon = 27
+guanaco = 28
+llama2 = 29


-class
+class DocumentSubset(Enum):
 Relevant = 0
-
-
+RelSources = 1
+TopKSources = 2


 non_query_commands = [
-
-
+DocumentSubset.RelSources.name,
+DocumentSubset.TopKSources.name
 ]


+class DocumentChoice(Enum):
+ALL = 'All'
+
+
 class LangChainMode(Enum):
 """LangChain mode"""

 DISABLED = "Disabled"
-CHAT_LLM = "ChatLLM"
 LLM = "LLM"
 ALL = "All"
 WIKI = "wiki"
@@ -60,6 +65,12 @@ class LangChainMode(Enum):
 H2O_DAI_DOCS = "DriverlessAI docs"


+# modes should not be removed from visible list or added by name
+langchain_modes_intrinsic = [LangChainMode.DISABLED.value,
+LangChainMode.LLM.value,
+LangChainMode.MY_DATA.value]
+
+
 class LangChainAction(Enum):
 """LangChain action"""

@@ -71,6 +82,13 @@ class LangChainAction(Enum):
 SUMMARIZE_REFINE = "Summarize_refine"


+class LangChainAgent(Enum):
+"""LangChain agents"""
+
+SEARCH = "Search"
+# CSV = "csv"  # WIP
+
+
 no_server_str = no_lora_str = no_model_str = '[None/Remove]'

 # from site-packages/langchain/llms/openai.py
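A short sketch, separate from the commit, of how the new enums travel through the API: subsets are passed by .name, document choices by .value, and requested agents are validated against the LangChainAgent values exactly as gen.py now does:

# Illustrative only; mirrors how gen.py consumes the new enums.
from enums import DocumentSubset, DocumentChoice, LangChainAgent, non_query_commands

langchain_agents_list = [x.value for x in list(LangChainAgent)]  # ['Search']
payload = dict(document_subset=DocumentSubset.Relevant.name,     # 'Relevant'
               document_choice=[DocumentChoice.ALL.value],       # ['All']
               langchain_agents=[])                              # must be a subset of langchain_agents_list
assert len(set(payload['langchain_agents']).difference(langchain_agents_list)) == 0
print(non_query_commands)  # ['RelSources', 'TopKSources'] act as commands, not queries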
evaluate_params.py
CHANGED
@@ -1,3 +1,6 @@
+input_args_list = ['model_state', 'my_db_state', 'selection_docs_state']
+
+
 no_default_param_names = [
 'instruction',
 'iinput',
@@ -30,7 +33,9 @@ eval_func_param_names = ['instruction',
 'instruction_nochat',
 'iinput_nochat',
 'langchain_mode',
+'add_chat_history_to_context',
 'langchain_action',
+'langchain_agents',
 'top_k_docs',
 'chunk',
 'chunk_size',
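The ordering above matters because the API passes evaluate()'s inputs positionally; a small illustrative helper (not repo code) showing how eval_func_param_names and the new input_args_list recover keyword arguments from that positional tuple:

# Illustrative only: map positional API args back to names.
from evaluate_params import eval_func_param_names, input_args_list

def args_to_kwargs(args):
    assert len(args) == len(eval_func_param_names)
    return dict(zip(eval_func_param_names, args))

# model_state, my_db_state and the new selection_docs_state ride along separately,
# in the order given by input_args_list.
print(input_args_list)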
gen.py
CHANGED
@@ -8,7 +8,6 @@ import sys
 import os
 import time
 import traceback
-import types
 import typing
 import warnings
 from datetime import datetime
@@ -28,12 +27,12 @@ os.environ['BITSANDBYTES_NOWELCOME'] = '1'
 warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

 from evaluate_params import eval_func_param_names, no_default_param_names
-from enums import
-source_postfix, LangChainAction
+from enums import DocumentSubset, LangChainMode, no_lora_str, model_token_mapping, no_model_str, source_prefix, \
+source_postfix, LangChainAction, LangChainAgent, DocumentChoice
 from loaders import get_loaders
 from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial, EThread, get_githash, \
 import_matplotlib, get_device, makedirs, get_kwargs, start_faulthandler, get_hf_server, FakeTokenizer, remove, \
-have_langchain
+have_langchain, set_openai, load_collection_enum

 start_faulthandler()
 import_matplotlib()
@@ -50,10 +49,10 @@ from transformers import GenerationConfig, AutoModel, TextIteratorStreamer
 from prompter import Prompter, inv_prompt_type_to_model_lower, non_hf_types, PromptType, get_prompt, generate_prompt
 from stopping import get_stopping

-langchain_modes = [x.value for x in list(LangChainMode)]
-
 langchain_actions = [x.value for x in list(LangChainAction)]

+langchain_agents_list = [x.value for x in list(LangChainAgent)]
+
 scratch_base_dir = '/tmp/'
@@ -114,6 +113,7 @@ def main(
 show_examples: bool = None,
 verbose: bool = False,
 h2ocolors: bool = True,
+dark: bool = False,  # light tends to be best
 height: int = 600,
 show_lora: bool = True,
 login_mode_if_model0: bool = False,
@@ -134,7 +134,7 @@ def main(
 extra_lora_options: typing.List[str] = [],
 extra_server_options: typing.List[str] = [],

-score_model: str = '
+score_model: str = 'auto',

 eval_filename: str = None,
 eval_prompts_only_num: int = 0,
@@ -143,22 +143,30 @@ def main(

 langchain_mode: str = None,
 langchain_action: str = LangChainAction.QUERY.value,
+langchain_agents: list = [],
 force_langchain_evaluate: bool = False,
+langchain_modes: list = [x.value for x in list(LangChainMode)],
 visible_langchain_modes: list = ['UserData', 'MyData'],
 # WIP:
 # visible_langchain_actions: list = langchain_actions.copy(),
 visible_langchain_actions: list = [LangChainAction.QUERY.value, LangChainAction.SUMMARIZE_MAP.value],
-
-
+visible_langchain_agents: list = langchain_agents_list.copy(),
+document_subset: str = DocumentSubset.Relevant.name,
+document_choice: list = [DocumentChoice.ALL.value],
 user_path: str = None,
+langchain_mode_paths: dict = {'UserData': None},
 detect_user_path_changes_every_query: bool = False,
+use_llm_if_no_docs: bool = False,
 load_db_if_exists: bool = True,
 keep_sources_in_context: bool = False,
 db_type: str = 'chroma',
 use_openai_embedding: bool = False,
 use_openai_model: bool = False,
 hf_embedding_model: str = None,
+cut_distance: float = 1.64,
+add_chat_history_to_context: bool = True,
 allow_upload_to_user_data: bool = True,
+reload_langchain_state: bool = True,
 allow_upload_to_my_data: bool = True,
 enable_url_upload: bool = True,
 enable_text_upload: bool = True,
@@ -175,6 +183,7 @@ def main(
 pre_load_caption_model: bool = False,
 caption_gpu: bool = True,
 enable_ocr: bool = False,
+enable_pdf_ocr: str = 'auto',
 ):
 """

@@ -196,6 +205,8 @@ def main(
 Or Address can be "openai_chat" or "openai" for OpenAI API
 e.g. python generate.py --inference_server="openai_chat" --base_model=gpt-3.5-turbo
 e.g. python generate.py --inference_server="openai" --base_model=text-davinci-003
+Or Address can be "vllm:IP:port" or "vllm:IP:port" for OpenAI-compliant vLLM endpoint
+Note: vllm_chat not supported by vLLM project.
 :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
 :param prompt_dict: If prompt_type=custom, then expects (some) items returned by get_prompt(..., return_dict=True)
 :param model_lock: Lock models to specific combinations, for ease of use and extending to many models
@@ -252,6 +263,7 @@ def main(
 :param show_examples: whether to show clickable examples in gradio
 :param verbose: whether to show verbose prints
 :param h2ocolors: whether to use H2O.ai theme
+:param dark: whether to use dark mode for UI by default (still controlled in UI)
 :param height: height of chat window
 :param show_lora: whether to show LORA options in UI (expert so can be hard to understand)
 :param login_mode_if_model0: set to True to load --base_model after client logs in, to be able to free GPU memory when model is swapped
@@ -271,49 +283,73 @@ def main(
 :param extra_model_options: extra models to show in list in gradio
 :param extra_lora_options: extra LORA to show in list in gradio
 :param extra_server_options: extra servers to show in list in gradio
-:param score_model: which model to score responses
+:param score_model: which model to score responses
+None: no response scoring
+'auto': auto mode, '' (no model) for CPU, 'OpenAssistant/reward-model-deberta-v3-large-v2' for GPU,
+because on CPU takes too much compute just for scoring response
 :param eval_filename: json file to use for evaluation, if None is sharegpt
 :param eval_prompts_only_num: for no gradio benchmark, if using eval_filename prompts for eval instead of examples
 :param eval_prompts_only_seed: for no gradio benchmark, seed for eval_filename sampling
 :param eval_as_output: for no gradio benchmark, whether to test eval_filename output itself
 :param langchain_mode: Data source to include. Choose "UserData" to only consume files from make_db.py.
+None: auto mode, check if langchain package exists, at least do LLM if so, else Disabled
 WARNING: wiki_full requires extra data processing via read_wiki_full.py and requires really good workstation to generate db, unless already present.
 :param langchain_action: Mode langchain operations in on documents.
 Query: Make query of document(s)
 Summarize or Summarize_map_reduce: Summarize document(s) via map_reduce
 Summarize_all: Summarize document(s) using entire document at once
 Summarize_refine: Summarize document(s) using entire document, and try to refine before returning summary
+:param langchain_agents: Which agents to use
+'search': Use Web Search as context for LLM response, e.g. SERP if have SERPAPI_API_KEY in env
 :param force_langchain_evaluate: Whether to force langchain LLM use even if not doing langchain, mostly for testing.
 :param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode.
 If already have db, any new/changed files are added automatically if path set, does not have to be same path used for prior db sources
+:param langchain_mode_paths: dict of langchain_mode keys and disk path values to use for source of documents
+E.g. "{'UserData2': 'userpath2'}"
+Can be None even if existing DB, to avoid new documents being added from that path, source links that are on disk still work.
+If user_path is not None, that path is used for 'UserData' instead of the value in this dict
 :param detect_user_path_changes_every_query: whether to detect if any files changed or added every similarity search (by file hashes).
 Expensive for large number of files, so not done by default. By default only detect changes during db loading.
+:param langchain_modes: names of collections/dbs to potentially have
 :param visible_langchain_modes: dbs to generate at launch to be ready for LLM
 Can be up to ['wiki', 'wiki_full', 'UserData', 'MyData', 'github h2oGPT', 'DriverlessAI docs']
 But wiki_full is expensive and requires preparation
 To allow scratch space only live in session, add 'MyData' to list
 Default: If only want to consume local files, e.g. prepared by make_db.py, only include ['UserData']
-
+If have own user modes, need to add these here or add in UI.
+A state file is stored in visible_langchain_modes.pkl containing last UI-selected values of:
+langchain_modes, visible_langchain_modes, and langchain_mode_paths
+Delete the file if you want to start fresh,
+but in any case the user_path passed in CLI is used for UserData even if was None or different
 :param visible_langchain_actions: Which actions to allow
+:param visible_langchain_agents: Which agents to allow
 :param document_subset: Default document choice when taking subset of collection
-:param document_choice: Chosen document(s) by internal name
+:param document_choice: Chosen document(s) by internal name, 'All' means use all docs
+:param use_llm_if_no_docs: Whether to use LLM even if no documents, when langchain_mode=UserData or MyData or custom
 :param load_db_if_exists: Whether to load chroma db if exists or re-generate db
 :param keep_sources_in_context: Whether to keep url sources in context, not helpful usually
 :param db_type: 'faiss' for in-memory or 'chroma' or 'weaviate' for persisted on disk
 :param use_openai_embedding: Whether to use OpenAI embeddings for vector db
 :param use_openai_model: Whether to use OpenAI model for use with vector db
 :param hf_embedding_model: Which HF embedding model to use for vector db
-Default is instructor-large with 768 parameters per embedding if have GPUs, else all-MiniLM-L6-
+Default is instructor-large with 768 parameters per embedding if have GPUs, else all-MiniLM-L6-v2 if no GPUs
 Can also choose simpler model with 384 parameters per embedding: "sentence-transformers/all-MiniLM-L6-v2"
 Can also choose even better embedding with 1024 parameters: 'hkunlp/instructor-xl'
 We support automatically changing of embeddings for chroma, with a backup of db made if this is done
-:param
+:param cut_distance: Distance to cut off references with larger distances when showing references.
+1.64 is good to avoid dropping references for all-MiniLM-L6-v2, but instructor-large will always show excessive references.
+For all-MiniLM-L6-v2, a value of 1.5 can push out even more references, or a large value of 100 can avoid any loss of references.
+:param add_chat_history_to_context: Include chat context when performing action
+Not supported yet for openai_chat when using document collection instead of LLM
+Also not supported when using CLI mode
+:param allow_upload_to_user_data: Whether to allow file uploads to update shared vector db (UserData or custom user dbs)
+:param reload_langchain_state: Whether to reload visible_langchain_modes.pkl file that contains any new user collections.
 :param allow_upload_to_my_data: Whether to allow file uploads to update scratch vector db
 :param enable_url_upload: Whether to allow upload from URL
 :param enable_text_upload: Whether to allow upload of text
 :param enable_sources_list: Whether to allow list (or download for non-shared db) of list of sources for chosen db
 :param chunk: Whether to chunk data (True unless know data is already optimally chunked)
-:param chunk_size: Size of chunks, with typically top-4 passed to LLM, so
+:param chunk_size: Size of chunks, with typically top-4 passed to LLM, so needs to be in context length
 :param top_k_docs: number of chunks to give LLM
 :param reverse_docs: whether to reverse docs order so most relevant is closest to question.
 Best choice for sufficiently smart model, and truncation occurs for oldest context, so best then too.
@@ -327,11 +363,15 @@ def main(
 captions_model: str = "Salesforce/blip2-flan-t5-xl", # question/answer capable, 16GB state
 captions_model: str = "Salesforce/blip2-flan-t5-xxl", # question/answer capable, 60GB state
 Note: opt-based blip2 are not permissive license due to opt and Meta license restrictions
+Disabled for CPU since BLIP requires CUDA
 :param pre_load_caption_model: Whether to preload caption model, or load after forking parallel doc loader
 parallel loading disabled if preload and have images, to prevent deadlocking on cuda context
 Recommended if using larger caption model
 :param caption_gpu: If support caption, then use GPU if exists
 :param enable_ocr: Whether to support OCR on images
+:param enable_pdf_ocr: 'auto' means only use OCR if normal text extraction fails. Useful for pure image-based PDFs with text
+'on' means always do OCR as additional parsing of same documents
+'off' means don't do OCR (e.g. because it's slow even if 'auto' only would trigger if nothing else worked)
 :return:
 """
 if base_model is None:
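The docstring above introduces score_model='auto'; a compact, illustrative restatement of the resolution rule that the CPU/GPU branches added further down implement:

# Illustrative restatement of the new score_model='auto' behavior (not repo code).
def resolve_score_model(score_model, n_gpus):
    if str(score_model) == 'None':
        return ''  # scoring explicitly disabled
    if score_model == 'auto':
        # no scoring model on CPU (too costly), reward model on GPU
        return '' if n_gpus == 0 else 'OpenAssistant/reward-model-deberta-v3-large-v2'
    return score_model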
@@ -393,7 +433,29 @@ def main(
 if langchain_mode is not None:
 visible_langchain_modes += [langchain_mode]

+# update
+if isinstance(langchain_mode_paths, str):
+langchain_mode_paths = ast.literal_eval(langchain_mode_paths)
+assert isinstance(langchain_mode_paths, dict)
+if user_path:
+langchain_mode_paths['UserData'] = user_path
+makedirs(user_path)
+
+if is_public:
+allow_upload_to_user_data = False
+if LangChainMode.USER_DATA.value in visible_langchain_modes:
+visible_langchain_modes.remove(LangChainMode.USER_DATA.value)
+
+# in-place, for non-scratch dbs
+if allow_upload_to_user_data:
+update_langchain(langchain_modes, visible_langchain_modes, langchain_mode_paths, '')
+# always listen to CLI-passed user_path if passed
+if user_path:
+langchain_mode_paths['UserData'] = user_path
+
 assert langchain_action in langchain_actions, "Invalid langchain_action %s" % langchain_action
+assert len(
+set(langchain_agents).difference(langchain_agents_list)) == 0, "Invalid langchain_agents %s" % langchain_agents

 # if specifically chose not to show My or User Data, disable upload, so gradio elements are simpler
 if LangChainMode.MY_DATA.value not in visible_langchain_modes:
@@ -404,21 +466,22 @@ def main(
 # auto-set langchain_mode
 if have_langchain and langchain_mode is None:
 # start in chat mode, in case just want to chat and don't want to get "No documents to query" by default.
-langchain_mode = LangChainMode.
+langchain_mode = LangChainMode.LLM.value
-if allow_upload_to_user_data and not is_public and
+if allow_upload_to_user_data and not is_public and langchain_mode_paths['UserData']:
 print("Auto set langchain_mode=%s. Could use UserData instead." % langchain_mode, flush=True)
 elif allow_upload_to_my_data:
 print("Auto set langchain_mode=%s. Could use MyData instead."
 " To allow UserData to pull files from disk,"
-" set user_path and ensure allow_upload_to_user_data=True" % langchain_mode,
+" set user_path or langchain_mode_paths, and ensure allow_upload_to_user_data=True" % langchain_mode,
+flush=True)
 else:
 raise RuntimeError("Please pass --langchain_mode=<chosen mode> out of %s" % langchain_modes)
-if not have_langchain and langchain_mode not in [None, LangChainMode.DISABLED.value, LangChainMode.LLM.value
+if not have_langchain and langchain_mode not in [None, LangChainMode.DISABLED.value, LangChainMode.LLM.value]:
 raise RuntimeError("Asked for LangChain mode but langchain python package cannot be found.")
 if langchain_mode is None:
 # if not set yet, disable
 langchain_mode = LangChainMode.DISABLED.value
-print("Auto set langchain_mode=%s" % langchain_mode, flush=True)
+print("Auto set langchain_mode=%s Have langchain package: %s" % (langchain_mode, have_langchain), flush=True)

 if is_public:
 allow_upload_to_user_data = False
@@ -474,7 +537,7 @@ def main(
 # HF accounted for later in get_max_max_new_tokens()
 save_dir = os.getenv('SAVE_DIR', save_dir)
 score_model = os.getenv('SCORE_MODEL', score_model)
-if score_model == 'None'
+if str(score_model) == 'None':
 score_model = ''
 concurrency_count = int(os.getenv('CONCURRENCY_COUNT', concurrency_count))
 api_open = bool(int(os.getenv('API_OPEN', str(int(api_open)))))
@@ -482,6 +545,7 @@ def main(

 n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
 if n_gpus == 0:
+enable_captions = False
 gpu_id = None
 load_8bit = False
 load_4bit = False
@@ -499,7 +563,11 @@ def main(
 if hf_embedding_model is None:
 # if no GPUs, use simpler embedding model to avoid cost in time
 hf_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
+if score_model == 'auto':
+score_model = ''
 else:
+if score_model == 'auto':
+score_model = 'OpenAssistant/reward-model-deberta-v3-large-v2'
 if hf_embedding_model is None:
 # if still None, then set default
 hf_embedding_model = 'hkunlp/instructor-large'
@@ -524,8 +592,6 @@ def main(

 if offload_folder:
 makedirs(offload_folder)
-if user_path:
-makedirs(user_path)

 placeholder_instruction, placeholder_input, \
 stream_output, show_examples, \
@@ -551,7 +617,7 @@ def main(
 verbose,
 )

-git_hash = get_githash()
+git_hash = get_githash() if is_public or os.getenv('GET_GITHASH') else "GET_GITHASH"
 locals_dict = locals()
 locals_print = '\n'.join(['%s: %s' % (k, v) for k, v in locals_dict.items()])
 if verbose:
@@ -565,7 +631,7 @@ def main(
 get_some_dbs_from_hf()
 dbs = {}
 for langchain_mode1 in visible_langchain_modes:
-if langchain_mode1 in ['MyData']:
+if langchain_mode1 in ['MyData']:  # FIXME: Remove other custom temp dbs
 # don't use what is on disk, remove it instead
 for gpath1 in glob.glob(os.path.join(scratch_base_dir, 'db_dir_%s*' % langchain_mode1)):
 if os.path.isdir(gpath1):
@@ -580,7 +646,7 @@ def main(
 db = prep_langchain(persist_directory1,
 load_db_if_exists,
 db_type, use_openai_embedding,
-langchain_mode1,
+langchain_mode1, langchain_mode_paths,
 hf_embedding_model,
 kwargs_make_db=locals())
 finally:
@@ -599,6 +665,14 @@ def main(
 model_state_none = dict(model=None, tokenizer=None, device=None,
 base_model=None, tokenizer_base_model=None, lora_weights=None,
 inference_server=None, prompt_type=None, prompt_dict=None)
+my_db_state0 = {LangChainMode.MY_DATA.value: [None, None]}
+selection_docs_state0 = dict(visible_langchain_modes=visible_langchain_modes,
+langchain_mode_paths=langchain_mode_paths,
+langchain_modes=langchain_modes)
+selection_docs_state = selection_docs_state0
+langchain_modes0 = langchain_modes
+langchain_mode_paths0 = langchain_mode_paths
+visible_langchain_modes0 = visible_langchain_modes

 if cli:
 from cli import run_cli
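A sketch of how the new --langchain_mode_paths option is consumed, following the block added to main() above; the dict literal and paths are placeholders taken from the docstring example:

# Illustrative only; mirrors the handling added to main().
import ast

def normalize_mode_paths(langchain_mode_paths, user_path=None):
    if isinstance(langchain_mode_paths, str):
        # e.g. --langchain_mode_paths="{'UserData2': 'userpath2'}"
        langchain_mode_paths = ast.literal_eval(langchain_mode_paths)
    assert isinstance(langchain_mode_paths, dict)
    if user_path:
        # a CLI-passed user_path always wins for the shared 'UserData' collection
        langchain_mode_paths['UserData'] = user_path
    return langchain_mode_paths

print(normalize_mode_paths("{'UserData2': 'userpath2'}", user_path='user_path'))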
@@ -967,11 +1041,13 @@ def get_model(
 client = gr_client or hf_client
 # Don't return None, None for model, tokenizer so triggers
 return client, tokenizer, 'http'
-if isinstance(inference_server, str) and
-
-
-
-
+if isinstance(inference_server, str) and (
+inference_server.startswith('openai') or inference_server.startswith('vllm')):
+if inference_server.startswith('openai'):
+assert os.getenv('OPENAI_API_KEY'), "Set environment for OPENAI_API_KEY"
+# Don't return None, None for model, tokenizer so triggers
+# include small token cushion
+tokenizer = FakeTokenizer(model_max_length=model_token_mapping[base_model] - 50)
 return inference_server, tokenizer, inference_server
 assert not inference_server, "Malformed inference_server=%s" % inference_server
 if base_model in non_hf_types:
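With an OpenAI or vLLM inference server there is no local tokenizer, so get_model() now returns a FakeTokenizer sized from model_token_mapping with a small cushion; an illustrative use of that sizing rule (the gpt-3.5-turbo key is assumed to be present in the mapping):

# Illustrative sizing only; FakeTokenizer and model_token_mapping come from utils/enums.
from enums import model_token_mapping
from utils import FakeTokenizer

base_model = 'gpt-3.5-turbo'
# keep ~50 tokens of headroom so prompt plus generation stay inside the server limit
tokenizer = FakeTokenizer(model_max_length=model_token_mapping[base_model] - 50)
print(tokenizer.model_max_length)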
@@ -1255,6 +1331,7 @@ def get_score_model(score_model: str = None,
 def evaluate(
 model_state,
 my_db_state,
+selection_docs_state,
 # START NOTE: Examples must have same order of parameters
 instruction,
 iinput,
@@ -1277,7 +1354,9 @@ def evaluate(
 instruction_nochat,
 iinput_nochat,
 langchain_mode,
+add_chat_history_to_context,
 langchain_action,
+langchain_agents,
 top_k_docs,
 chunk,
 chunk_size,
@@ -1291,6 +1370,9 @@ def evaluate(
 save_dir=None,
 sanitize_bot_response=False,
 model_state0=None,
+langchain_modes0=None,
+langchain_mode_paths0=None,
+visible_langchain_modes0=None,
 memory_restriction_level=None,
 max_max_new_tokens=None,
 is_public=None,
@@ -1298,13 +1380,14 @@ def evaluate(
 raise_generate_gpu_exceptions=None,
 chat_context=None,
 lora_weights=None,
+use_llm_if_no_docs=False,
 load_db_if_exists=True,
 dbs=None,
-user_path=None,
 detect_user_path_changes_every_query=None,
 use_openai_embedding=None,
 use_openai_model=None,
 hf_embedding_model=None,
+cut_distance=None,
 db_type=None,
 n_jobs=None,
 first_para=None,
@@ -1333,6 +1416,16 @@ def evaluate(
 assert chunk_size is not None and isinstance(chunk_size, int)
 assert n_jobs is not None
 assert first_para is not None
+assert isinstance(add_chat_history_to_context, bool)
+
+if selection_docs_state is not None:
+langchain_modes = selection_docs_state.get('langchain_modes', langchain_modes0)
+langchain_mode_paths = selection_docs_state.get('langchain_mode_paths', langchain_mode_paths0)
+visible_langchain_modes = selection_docs_state.get('visible_langchain_modes', visible_langchain_modes0)
+else:
+langchain_modes = langchain_modes0
+langchain_mode_paths = langchain_mode_paths0
+visible_langchain_modes = visible_langchain_modes0

 if debug:
 locals_dict = locals().copy()
@@ -1452,18 +1545,24 @@ def evaluate(
 # THIRD PLACE where LangChain referenced, but imports only occur if enabled and have db to use
 assert langchain_mode in langchain_modes, "Invalid langchain_mode %s" % langchain_mode
 assert langchain_action in langchain_actions, "Invalid langchain_action %s" % langchain_action
-
-
-
-
+assert len(
+set(langchain_agents).difference(langchain_agents_list)) == 0, "Invalid langchain_agents %s" % langchain_agents
+if dbs is not None and langchain_mode in dbs:
+db = dbs[langchain_mode]
+elif my_db_state is not None and langchain_mode in my_db_state:
+db1 = my_db_state[langchain_mode]
+if db1 is not None and len(db1) == 2:
+db = db1[0]
+else:
+db = None
 else:
-
-do_langchain_path = langchain_mode not in [False, 'Disabled', '
+db = None
+do_langchain_path = langchain_mode not in [False, 'Disabled', 'LLM'] or \
 base_model in non_hf_types or \
 force_langchain_evaluate
 if do_langchain_path:
 outr = ""
-# use smaller
+# use smaller cut_distance for wiki_full since so many matches could be obtained, and often irrelevant unless close
 from gpt_langchain import run_qa_db
 gen_hyper_langchain = dict(do_sample=do_sample,
 temperature=temperature,
@@ -1484,11 +1583,13 @@ def evaluate(
 inference_server=inference_server,
 stream_output=stream_output,
 prompter=prompter,
+use_llm_if_no_docs=use_llm_if_no_docs,
 load_db_if_exists=load_db_if_exists,
-db=
-
+db=db,
+langchain_mode_paths=langchain_mode_paths,
 detect_user_path_changes_every_query=detect_user_path_changes_every_query,
-
+cut_distance=1.1 if langchain_mode in ['wiki_full'] else cut_distance,
+add_chat_history_to_context=add_chat_history_to_context,
 use_openai_embedding=use_openai_embedding,
 use_openai_model=use_openai_model,
 hf_embedding_model=hf_embedding_model,
@@ -1498,6 +1599,7 @@ def evaluate(
 chunk_size=chunk_size,
 langchain_mode=langchain_mode,
 langchain_action=langchain_action,
+langchain_agents=langchain_agents,
 document_subset=document_subset,
 document_choice=document_choice,
 db_type=db_type,
@@ -1526,6 +1628,7 @@ def evaluate(
 inference_server=inference_server,
 langchain_mode=langchain_mode,
 langchain_action=langchain_action,
+langchain_agents=langchain_agents,
 document_subset=document_subset,
 document_choice=document_choice,
 num_prompt_tokens=num_prompt_tokens,
@@ -1549,12 +1652,12 @@ def evaluate(
 clear_torch_cache()
 return

-if inference_server.startswith('
-
-
+if inference_server.startswith('vllm') or inference_server.startswith('openai') or inference_server.startswith(
+'http'):
+if inference_server.startswith('vllm') or inference_server.startswith('openai'):
 where_from = "openai_client"
+openai, inf_type = set_openai(inference_server)

-openai.api_key = os.getenv("OPENAI_API_KEY")
 terminate_response = prompter.terminate_response or []
 stop_sequences = list(set(terminate_response + [prompter.PreResponse]))
 stop_sequences = [x for x in stop_sequences if x]
@@ -1567,7 +1670,7 @@ def evaluate(
 n=num_return_sequences,
 presence_penalty=1.07 - repetition_penalty + 0.6,  # so good default
 )
-if inference_server == 'openai':
+if inf_type == 'vllm' or inference_server == 'openai':
 response = openai.Completion.create(
 model=base_model,
 prompt=prompt,
@@ -1590,7 +1693,9 @@ def evaluate(
 yield dict(response=prompter.get_response(prompt + text, prompt=prompt,
 sanitize_bot_response=sanitize_bot_response),
 sources='')
-elif inference_server == 'openai_chat':
+elif inf_type == 'vllm_chat' or inference_server == 'openai_chat':
+if inf_type == 'vllm_chat':
+raise NotImplementedError('%s not supported by vLLM' % inf_type)
 response = openai.ChatCompletion.create(
 model=base_model,
 messages=[
@@ -1642,7 +1747,9 @@ def evaluate(
 chat_client = False
 where_from = "gr_client"
 client_langchain_mode = 'Disabled'
+client_add_chat_history_to_context = True
 client_langchain_action = LangChainAction.QUERY.value
+client_langchain_agents = []
 gen_server_kwargs = dict(temperature=temperature,
 top_p=top_p,
 top_k=top_k,
@@ -1694,12 +1801,14 @@ def evaluate(
 instruction_nochat=gr_prompt if not chat_client else '',
 iinput_nochat=gr_iinput,  # only for chat=False
 langchain_mode=client_langchain_mode,
+add_chat_history_to_context=client_add_chat_history_to_context,
 langchain_action=client_langchain_action,
+langchain_agents=client_langchain_agents,
 top_k_docs=top_k_docs,
 chunk=chunk,
 chunk_size=chunk_size,
-document_subset=
-document_choice=[],
+document_subset=DocumentSubset.Relevant.name,
+document_choice=[DocumentChoice.ALL.value],
 )
 api_name = '/submit_nochat_api'  # NOTE: like submit_nochat but stable API for string dict passing
 if not stream_output:
@@ -1993,7 +2102,7 @@ def evaluate(


 inputs_list_names = list(inspect.signature(evaluate).parameters)
-state_names = ['model_state', 'my_db_state']
+state_names = ['model_state', 'my_db_state', 'selection_docs_state']
 inputs_kwargs_list = [x for x in inputs_list_names if x not in eval_func_param_names + state_names]


@@ -2276,8 +2385,8 @@ y = np.random.randint(0, 1, 100)

 # move to correct position
 for example in examples:
-example += [chat, '', '', LangChainMode.DISABLED.value, LangChainAction.QUERY.value,
-top_k_docs, chunk, chunk_size,
 ]
 # adjust examples if non-chat mode
 if not chat:
@@ -2337,7 +2446,7 @@ def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_l
 truncation=True,
 max_length=max_length_tokenize).to(smodel.device)
 try:
-score = torch.sigmoid(smodel(**inputs).logits[0]).cpu().detach().numpy()[0]
 except torch.cuda.OutOfMemoryError as e:
 print("GPU OOM 3: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
 del inputs
@@ -2383,14 +2492,14 @@ def check_locals(**kwargs):


 def get_model_max_length(model_state):
-if not isinstance(model_state['tokenizer'], (str,
 return model_state['tokenizer'].model_max_length
 else:
 return 2048


 def get_max_max_new_tokens(model_state, **kwargs):
-if not isinstance(model_state['tokenizer'], (str,
 max_max_new_tokens = model_state['tokenizer'].model_max_length
 else:
 max_max_new_tokens = None
@@ -2422,12 +2531,15 @@ def get_minmax_top_k_docs(is_public):
 return min_top_k_docs, max_top_k_docs, label_top_k_docs


-def history_to_context(history, langchain_mode1,
 memory_restriction_level1, keep_sources_in_context1):
 """
 consumes all history up to (but not including) latest history item that is presumed to be an [instruction, None] pair
 :param history:
 :param langchain_mode1:
 :param prompt_type1:
 :param prompt_dict1:
 :param chat1:
@@ -2440,7 +2552,7 @@ def history_to_context(history, langchain_mode1, prompt_type1, prompt_dict1, cha
 _, _, _, max_prompt_length = get_cutoffs(memory_restriction_level1,
 for_context=True, model_max_length=model_max_length1)
 context1 = ''
-if max_prompt_length is not None and
 context1 = ''
 # - 1 below because current instruction already in history from user()
 for histi in range(0, len(history) - 1):
@@ -2476,6 +2588,22 @@ def history_to_context(history, langchain_mode1, prompt_type1, prompt_dict1, cha
 return context1


 def entrypoint_main():
 """
 Examples:
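evaluate() now routes both OpenAI and vLLM endpoints through the openai client via set_openai(inference_server), which returns the configured module and an inf_type tag. The helper itself is not shown in this commit; the following is only a plausible sketch, assuming the 'vllm:IP:port' address format from the docstring and pre-1.0 openai module attributes:

# Hedged sketch of a set_openai()-style helper; not the repository's implementation.
import os

def set_openai_sketch(inference_server):
    import openai  # pre-1.0 style module-level configuration assumed
    if inference_server.startswith('vllm'):
        # e.g. 'vllm:192.168.1.10:5000' -> OpenAI-compatible endpoint served by vLLM
        inf_type, ip, port = inference_server.split(':')
        openai.api_base = 'http://%s:%s/v1' % (ip, port)
        openai.api_key = 'EMPTY'  # assumption: vLLM's OpenAI server ignores the key
    else:
        inf_type = inference_server  # 'openai' or 'openai_chat'
        openai.api_key = os.getenv('OPENAI_API_KEY')
    return openai, inf_type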
|
2387 |
for example in examples:
|
2388 |
+
example += [chat, '', '', LangChainMode.DISABLED.value, True, LangChainAction.QUERY.value, [],
|
2389 |
+
top_k_docs, chunk, chunk_size, DocumentSubset.Relevant.name, []
|
2390 |
]
|
2391 |
# adjust examples if non-chat mode
|
2392 |
if not chat:
|
|
|
2446 |
truncation=True,
|
2447 |
max_length=max_length_tokenize).to(smodel.device)
|
2448 |
try:
|
2449 |
+
score = torch.sigmoid(smodel(**inputs.to(smodel.device)).logits[0].float()).cpu().detach().numpy()[0]
|
2450 |
except torch.cuda.OutOfMemoryError as e:
|
2451 |
print("GPU OOM 3: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
|
2452 |
del inputs
|
|
|
2492 |
|
2493 |
|
2494 |
def get_model_max_length(model_state):
|
2495 |
+
if not isinstance(model_state['tokenizer'], (str, type(None))):
|
2496 |
return model_state['tokenizer'].model_max_length
|
2497 |
else:
|
2498 |
return 2048
|
2499 |
|
2500 |
|
2501 |
def get_max_max_new_tokens(model_state, **kwargs):
|
2502 |
+
if not isinstance(model_state['tokenizer'], (str, type(None))):
|
2503 |
max_max_new_tokens = model_state['tokenizer'].model_max_length
|
2504 |
else:
|
2505 |
max_max_new_tokens = None
|
|
|
2531 |
return min_top_k_docs, max_top_k_docs, label_top_k_docs
|
2532 |
|
2533 |
|
2534 |
+
def history_to_context(history, langchain_mode1,
|
2535 |
+
add_chat_history_to_context,
|
2536 |
+
prompt_type1, prompt_dict1, chat1, model_max_length1,
|
2537 |
memory_restriction_level1, keep_sources_in_context1):
|
2538 |
"""
|
2539 |
consumes all history up to (but not including) latest history item that is presumed to be an [instruction, None] pair
|
2540 |
:param history:
|
2541 |
:param langchain_mode1:
|
2542 |
+
:param add_chat_history_to_context:
|
2543 |
:param prompt_type1:
|
2544 |
:param prompt_dict1:
|
2545 |
:param chat1:
|
|
|
2552 |
_, _, _, max_prompt_length = get_cutoffs(memory_restriction_level1,
|
2553 |
for_context=True, model_max_length=model_max_length1)
|
2554 |
context1 = ''
|
2555 |
+
if max_prompt_length is not None and add_chat_history_to_context:
|
2556 |
context1 = ''
|
2557 |
# - 1 below because current instruction already in history from user()
|
2558 |
for histi in range(0, len(history) - 1):
|
|
|
2588 |
return context1
|
2589 |
|
2590 |
|
2591 |
+
def update_langchain(langchain_modes, visible_langchain_modes, langchain_mode_paths, extra):
|
2592 |
+
# update from saved state on disk
|
2593 |
+
langchain_modes_from_file, visible_langchain_modes_from_file, langchain_mode_paths_from_file = \
|
2594 |
+
load_collection_enum(extra)
|
2595 |
+
|
2596 |
+
visible_langchain_modes_temp = visible_langchain_modes.copy() + visible_langchain_modes_from_file
|
2597 |
+
visible_langchain_modes.clear() # don't lose original reference
|
2598 |
+
[visible_langchain_modes.append(x) for x in visible_langchain_modes_temp if x not in visible_langchain_modes]
|
2599 |
+
|
2600 |
+
langchain_mode_paths.update(langchain_mode_paths_from_file)
|
2601 |
+
|
2602 |
+
langchain_modes_temp = langchain_modes.copy() + langchain_modes_from_file
|
2603 |
+
langchain_modes.clear() # don't lose original reference
|
2604 |
+
[langchain_modes.append(x) for x in langchain_modes_temp if x not in langchain_modes]
|
2605 |
+
|
2606 |
+
|
2607 |
def entrypoint_main():
|
2608 |
"""
|
2609 |
Examples:
|
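For orientation, below is a minimal sketch of how a caller could exercise the new add_chat_history_to_context and langchain_agents fields over the stable /submit_nochat_api route referenced above. This is an illustration only, not part of the commit: the server URL and question text are placeholders, the enum string values ('Query', 'Relevant', 'All') are assumptions based on the enum names visible in this diff, and unspecified evaluate parameters are assumed to fall back to server defaults.

# hypothetical client-side usage sketch (assumptions noted above)
from gradio_client import Client

client = Client("http://localhost:7860")  # assumed local h2oGPT server
kwargs = dict(
    instruction_nochat="What is the title of the uploaded document?",
    iinput_nochat='',
    langchain_mode='UserData',
    add_chat_history_to_context=True,   # new field in this commit
    langchain_action='Query',           # assumed value of LangChainAction.QUERY
    langchain_agents=[],                # new field in this commit
    top_k_docs=3,
    chunk=True,
    chunk_size=512,
    document_subset='Relevant',         # assumed value of DocumentSubset.Relevant.name
    document_choice=['All'],            # assumed value of DocumentChoice.ALL
)
# same string-dict passing convention as the /submit_nochat_api route above
res = client.predict(str(dict(kwargs)), api_name='/submit_nochat_api')
print(res)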
gpt4all_llm.py
CHANGED
@@ -95,15 +95,17 @@ def get_llm_gpt4all(model_name,
                     streaming=False,
                     callbacks=None,
                     prompter=None,
+                    context='',
+                    iinput='',
                     verbose=False,
                     ):
     assert prompter is not None
     env_gpt4all_file = ".env_gpt4all"
     env_kwargs = dotenv_values(env_gpt4all_file)
-
+    max_tokens = env_kwargs.pop('max_tokens', 2048 - max_new_tokens)
     default_kwargs = dict(context_erase=0.5,
                           n_batch=1,
-
+                          max_tokens=max_tokens,
                           n_predict=max_new_tokens,
                           repeat_last_n=64 if repetition_penalty != 1.0 else 0,
                           repeat_penalty=repetition_penalty,
@@ -117,7 +119,8 @@ def get_llm_gpt4all(model_name,
         cls = H2OLlamaCpp
         model_path = env_kwargs.pop('model_path_llama') if model is None else model
         model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs'])
-        model_kwargs.update(dict(model_path=model_path, callbacks=callbacks, streaming=streaming,
+        model_kwargs.update(dict(model_path=model_path, callbacks=callbacks, streaming=streaming,
+                                 prompter=prompter, context=context, iinput=iinput))
         llm = cls(**model_kwargs)
         llm.client.verbose = verbose
     elif model_name == 'gpt4all_llama':
@@ -125,14 +128,16 @@ def get_llm_gpt4all(model_name,
         model_path = env_kwargs.pop('model_path_gpt4all_llama') if model is None else model
         model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs'])
         model_kwargs.update(
-            dict(model=model_path, backend='llama', callbacks=callbacks, streaming=streaming,
+            dict(model=model_path, backend='llama', callbacks=callbacks, streaming=streaming,
+                 prompter=prompter, context=context, iinput=iinput))
         llm = cls(**model_kwargs)
     elif model_name == 'gptj':
         cls = H2OGPT4All
         model_path = env_kwargs.pop('model_path_gptj') if model is None else model
         model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs'])
         model_kwargs.update(
-            dict(model=model_path, backend='gptj', callbacks=callbacks, streaming=streaming,
+            dict(model=model_path, backend='gptj', callbacks=callbacks, streaming=streaming,
+                 prompter=prompter, context=context, iinput=iinput))
         llm = cls(**model_kwargs)
     else:
         raise RuntimeError("No such model_name %s" % model_name)
@@ -142,6 +147,8 @@ def get_llm_gpt4all(model_name,
 class H2OGPT4All(gpt4all.GPT4All):
     model: Any
     prompter: Any
+    context: Any = ''
+    iinput: Any = ''
     """Path to the pre-trained GPT4All model file."""

     @root_validator()
@@ -187,10 +194,11 @@ class H2OGPT4All(gpt4all.GPT4All):
             **kwargs,
             ) -> str:
         # Roughly 4 chars per token if natural language
-
+        n_ctx = 2048
+        prompt = prompt[-self.max_tokens * 4:]

         # use instruct prompting
-        data_point = dict(context=
+        data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
         prompt = self.prompter.generate_prompt(data_point)

         verbose = False
@@ -206,6 +214,8 @@ from langchain.llms import LlamaCpp
 class H2OLlamaCpp(LlamaCpp):
     model_path: Any
     prompter: Any
+    context: Any
+    iinput: Any
     """Path to the pre-trained GPT4All model file."""

     @root_validator()
@@ -276,7 +286,7 @@ class H2OLlamaCpp(LlamaCpp):
             print("reduced tokens from %d -> %d" % (num_prompt_tokens, num_prompt_tokens2), flush=True)

         # use instruct prompting
-        data_point = dict(context=
+        data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
         prompt = self.prompter.generate_prompt(data_point)

         if verbose:
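A minimal sketch of how the new context and iinput fields reach the model prompt via the Prompter data_point shown in the hunks above. Illustration only, not part of the commit: the Prompter constructor arguments and the 'human_bot' prompt type are assumptions.

from prompter import Prompter

# assumed constructor; prompt_dict is only needed for custom prompt types
prompter = Prompter('human_bot', '', chat=False, stream_output=False)
# context/iinput map to the same data_point the GPT4All/LlamaCpp wrappers now build internally
data_point = dict(context='You are a concise assistant.',
                  instruction='Summarize the release notes.',
                  input='')
prompt = prompter.generate_prompt(data_point)
print(prompt)  # the wrapped prompt that is sent to the model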
gpt_langchain.py
CHANGED
@@ -21,16 +21,17 @@ import filelock
|
|
21 |
from joblib import delayed
|
22 |
from langchain.callbacks import streaming_stdout
|
23 |
from langchain.embeddings import HuggingFaceInstructEmbeddings
|
|
|
24 |
from tqdm import tqdm
|
25 |
|
26 |
-
from enums import
|
27 |
-
LangChainAction, LangChainMode
|
28 |
from evaluate_params import gen_hyper
|
29 |
from gen import get_model, SEED
|
30 |
from prompter import non_hf_types, PromptType, Prompter
|
31 |
from utils import wrapped_partial, EThread, import_matplotlib, sanitize_filename, makedirs, get_url, flatten_list, \
|
32 |
get_device, ProgressParallel, remove, hash_file, clear_torch_cache, NullContext, get_hf_server, FakeTokenizer, \
|
33 |
-
have_libreoffice, have_arxiv, have_playwright, have_selenium, have_tesseract, have_pymupdf
|
34 |
from utils_langchain import StreamingGradioCallbackHandler
|
35 |
|
36 |
import_matplotlib()
|
@@ -95,11 +96,15 @@ def get_db(sources, use_openai_embedding=False, db_type='faiss',
|
|
95 |
db = get_existing_db(None, persist_directory, load_db_if_exists, db_type, use_openai_embedding, langchain_mode,
|
96 |
hf_embedding_model, verbose=False)
|
97 |
if db is None:
|
|
|
|
|
|
|
|
|
98 |
db = Chroma.from_documents(documents=sources,
|
99 |
embedding=embedding,
|
100 |
persist_directory=persist_directory,
|
101 |
collection_name=collection_name,
|
102 |
-
|
103 |
db.persist()
|
104 |
clear_embedding(db)
|
105 |
save_embed(db, use_openai_embedding, hf_embedding_model)
|
@@ -276,15 +281,7 @@ from typing import Any, Dict, List, Optional, Set
|
|
276 |
|
277 |
from pydantic import Extra, Field, root_validator
|
278 |
|
279 |
-
from langchain.callbacks.manager import CallbackManagerForLLMRun
|
280 |
-
|
281 |
-
"""Wrapper around Huggingface text generation inference API."""
|
282 |
-
from functools import partial
|
283 |
-
from typing import Any, Dict, List, Optional
|
284 |
-
|
285 |
-
from pydantic import Extra, Field, root_validator
|
286 |
-
|
287 |
-
from langchain.callbacks.manager import CallbackManagerForLLMRun
|
288 |
from langchain.llms.base import LLM
|
289 |
|
290 |
|
@@ -312,6 +309,8 @@ class GradioInference(LLM):
|
|
312 |
sanitize_bot_response: bool = False
|
313 |
|
314 |
prompter: Any = None
|
|
|
|
|
315 |
client: Any = None
|
316 |
|
317 |
class Config:
|
@@ -355,13 +354,15 @@ class GradioInference(LLM):
|
|
355 |
stream_output = self.stream
|
356 |
gr_client = self.client
|
357 |
client_langchain_mode = 'Disabled'
|
|
|
358 |
client_langchain_action = LangChainAction.QUERY.value
|
|
|
359 |
top_k_docs = 1
|
360 |
chunk = True
|
361 |
chunk_size = 512
|
362 |
client_kwargs = dict(instruction=prompt if self.chat_client else '', # only for chat=True
|
363 |
-
iinput='', # only for chat=True
|
364 |
-
context=
|
365 |
# streaming output is supported, loops over and outputs each generation in streaming mode
|
366 |
# but leave stream_output=False for simple input/output mode
|
367 |
stream_output=stream_output,
|
@@ -382,14 +383,16 @@ class GradioInference(LLM):
|
|
382 |
chat=self.chat_client,
|
383 |
|
384 |
instruction_nochat=prompt if not self.chat_client else '',
|
385 |
-
iinput_nochat='',
|
386 |
langchain_mode=client_langchain_mode,
|
|
|
387 |
langchain_action=client_langchain_action,
|
|
|
388 |
top_k_docs=top_k_docs,
|
389 |
chunk=chunk,
|
390 |
chunk_size=chunk_size,
|
391 |
-
document_subset=
|
392 |
-
document_choice=[],
|
393 |
)
|
394 |
api_name = '/submit_nochat_api' # NOTE: like submit_nochat but stable API for string dict passing
|
395 |
if not stream_output:
|
@@ -459,6 +462,8 @@ class H2OHuggingFaceTextGenInference(HuggingFaceTextGenInference):
|
|
459 |
stream: bool = False
|
460 |
sanitize_bot_response: bool = False
|
461 |
prompter: Any = None
|
|
|
|
|
462 |
tokenizer: Any = None
|
463 |
client: Any = None
|
464 |
|
@@ -500,7 +505,7 @@ class H2OHuggingFaceTextGenInference(HuggingFaceTextGenInference):
|
|
500 |
prompt, num_prompt_tokens = H2OTextGenerationPipeline.limit_prompt(prompt, self.tokenizer)
|
501 |
|
502 |
# NOTE: TGI server does not add prompting, so must do here
|
503 |
-
data_point = dict(context=
|
504 |
prompt = self.prompter.generate_prompt(data_point)
|
505 |
|
506 |
gen_server_kwargs = dict(do_sample=self.do_sample,
|
@@ -566,6 +571,94 @@ class H2OHuggingFaceTextGenInference(HuggingFaceTextGenInference):
|
|
566 |
|
567 |
|
568 |
from langchain.chat_models import ChatOpenAI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
569 |
|
570 |
|
571 |
class H2OChatOpenAI(ChatOpenAI):
|
@@ -596,17 +689,36 @@ def get_llm(use_openai_model=False,
|
|
596 |
prompt_type=None,
|
597 |
prompt_dict=None,
|
598 |
prompter=None,
|
|
|
|
|
599 |
sanitize_bot_response=False,
|
600 |
verbose=False,
|
601 |
):
|
602 |
-
if
|
|
|
|
|
603 |
if use_openai_model and model_name is None:
|
604 |
model_name = "gpt-3.5-turbo"
|
605 |
-
|
606 |
-
|
607 |
-
|
608 |
-
|
609 |
cls = H2OChatOpenAI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
610 |
callbacks = [StreamingGradioCallbackHandler()]
|
611 |
llm = cls(model_name=model_name,
|
612 |
temperature=temperature if do_sample else 0,
|
@@ -616,11 +728,18 @@ def get_llm(use_openai_model=False,
|
|
616 |
frequency_penalty=0,
|
617 |
presence_penalty=1.07 - repetition_penalty + 0.6, # so good default
|
618 |
callbacks=callbacks if stream_output else None,
|
|
|
|
|
|
|
|
|
|
|
|
|
619 |
)
|
620 |
streamer = callbacks[0] if stream_output else None
|
621 |
if inference_server in ['openai', 'openai_chat']:
|
622 |
prompt_type = inference_server
|
623 |
else:
|
|
|
624 |
prompt_type = prompt_type or 'plain'
|
625 |
elif inference_server:
|
626 |
assert inference_server.startswith(
|
@@ -669,6 +788,8 @@ def get_llm(use_openai_model=False,
|
|
669 |
callbacks=callbacks if stream_output else None,
|
670 |
stream=stream_output,
|
671 |
prompter=prompter,
|
|
|
|
|
672 |
client=gr_client,
|
673 |
sanitize_bot_response=sanitize_bot_response,
|
674 |
)
|
@@ -689,6 +810,8 @@ def get_llm(use_openai_model=False,
|
|
689 |
callbacks=callbacks if stream_output else None,
|
690 |
stream=stream_output,
|
691 |
prompter=prompter,
|
|
|
|
|
692 |
tokenizer=tokenizer,
|
693 |
client=hf_client,
|
694 |
timeout=max_time,
|
@@ -721,6 +844,8 @@ def get_llm(use_openai_model=False,
|
|
721 |
verbose=verbose,
|
722 |
streaming=stream_output,
|
723 |
prompter=prompter,
|
|
|
|
|
724 |
)
|
725 |
else:
|
726 |
if model is None:
|
@@ -763,6 +888,8 @@ def get_llm(use_openai_model=False,
|
|
763 |
from h2oai_pipeline import H2OTextGenerationPipeline
|
764 |
pipe = H2OTextGenerationPipeline(model=model, use_prompter=True,
|
765 |
prompter=prompter,
|
|
|
|
|
766 |
prompt_type=prompt_type,
|
767 |
prompt_dict=prompt_dict,
|
768 |
sanitize_bot_response=sanitize_bot_response,
|
@@ -916,7 +1043,6 @@ def get_dai_docs(from_hf=False, get_pickle=True):
|
|
916 |
return sources
|
917 |
|
918 |
|
919 |
-
|
920 |
image_types = ["png", "jpg", "jpeg"]
|
921 |
non_image_types = ["pdf", "txt", "csv", "toml", "py", "rst", "rtf",
|
922 |
"md",
|
@@ -927,7 +1053,8 @@ non_image_types = ["pdf", "txt", "csv", "toml", "py", "rst", "rtf",
|
|
927 |
]
|
928 |
# "msg", GPL3
|
929 |
|
930 |
-
if have_libreoffice:
|
|
|
931 |
non_image_types.extend(["docx", "doc", "xls", "xlsx"])
|
932 |
|
933 |
file_types = non_image_types + image_types
|
@@ -936,9 +1063,11 @@ file_types = non_image_types + image_types
|
|
936 |
def add_meta(docs1, file):
|
937 |
file_extension = pathlib.Path(file).suffix
|
938 |
hashid = hash_file(file)
|
|
|
939 |
if not isinstance(docs1, (list, tuple, types.GeneratorType)):
|
940 |
docs1 = [docs1]
|
941 |
-
[x.metadata.update(dict(input_type=file_extension, date=str(datetime.now()), hashid=hashid)) for
|
|
|
942 |
|
943 |
|
944 |
def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False,
|
@@ -946,7 +1075,7 @@ def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False,
|
|
946 |
is_url=False, is_txt=False,
|
947 |
enable_captions=True,
|
948 |
captions_model=None,
|
949 |
-
enable_ocr=False, caption_loader=None,
|
950 |
headsize=50):
|
951 |
if file is None:
|
952 |
if fail_any_exception:
|
@@ -963,6 +1092,7 @@ def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False,
|
|
963 |
base_name = sanitize_filename(base_name) + "_" + str(uuid.uuid4())[:10]
|
964 |
base_path = os.path.join(dir_name, base_name)
|
965 |
if is_url:
|
|
|
966 |
if file.lower().startswith('arxiv:'):
|
967 |
query = file.lower().split('arxiv:')
|
968 |
if len(query) == 2 and have_arxiv:
|
@@ -1011,11 +1141,11 @@ def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False,
|
|
1011 |
add_meta(docs1, file)
|
1012 |
docs1 = clean_doc(docs1)
|
1013 |
doc1 = chunk_sources(docs1, chunk=chunk, chunk_size=chunk_size, language=Language.HTML)
|
1014 |
-
elif (file.lower().endswith('.docx') or file.lower().endswith('.doc')) and have_libreoffice:
|
1015 |
docs1 = UnstructuredWordDocumentLoader(file_path=file).load()
|
1016 |
add_meta(docs1, file)
|
1017 |
doc1 = chunk_sources(docs1, chunk=chunk, chunk_size=chunk_size)
|
1018 |
-
elif (file.lower().endswith('.xlsx') or file.lower().endswith('.xls')) and have_libreoffice:
|
1019 |
docs1 = UnstructuredExcelLoader(file_path=file).load()
|
1020 |
add_meta(docs1, file)
|
1021 |
doc1 = chunk_sources(docs1, chunk=chunk, chunk_size=chunk_size)
|
@@ -1114,21 +1244,54 @@ def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False,
|
|
1114 |
from dotenv import dotenv_values
|
1115 |
env_kwargs = dotenv_values(env_gpt4all_file)
|
1116 |
pdf_class_name = env_kwargs.get('PDF_CLASS_NAME', 'PyMuPDFParser')
|
|
|
|
|
1117 |
if have_pymupdf and pdf_class_name == 'PyMuPDFParser':
|
1118 |
# GPL, only use if installed
|
1119 |
from langchain.document_loaders import PyMuPDFLoader
|
1120 |
# load() still chunks by pages, but every page has title at start to help
|
1121 |
doc1 = PyMuPDFLoader(file).load()
|
|
|
|
|
|
|
1122 |
doc1 = clean_doc(doc1)
|
1123 |
-
|
1124 |
doc1 = UnstructuredPDFLoader(file).load()
|
|
|
|
|
|
|
1125 |
# seems to not need cleaning in most cases
|
1126 |
-
|
1127 |
# open-source fallback
|
1128 |
# load() still chunks by pages, but every page has title at start to help
|
1129 |
doc1 = PyPDFLoader(file).load()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1130 |
doc1 = clean_doc(doc1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1131 |
# Some PDFs return nothing or junk from PDFMinerLoader
|
|
|
|
|
|
|
|
|
|
|
|
|
1132 |
doc1 = chunk_sources(doc1, chunk=chunk, chunk_size=chunk_size)
|
1133 |
add_meta(doc1, file)
|
1134 |
elif file.lower().endswith('.csv'):
|
@@ -1181,7 +1344,7 @@ def path_to_doc1(file, verbose=False, fail_any_exception=False, return_file=True
|
|
1181 |
is_url=False, is_txt=False,
|
1182 |
enable_captions=True,
|
1183 |
captions_model=None,
|
1184 |
-
enable_ocr=False, caption_loader=None):
|
1185 |
if verbose:
|
1186 |
if is_url:
|
1187 |
print("Ingesting URL: %s" % file, flush=True)
|
@@ -1199,6 +1362,7 @@ def path_to_doc1(file, verbose=False, fail_any_exception=False, return_file=True
|
|
1199 |
enable_captions=enable_captions,
|
1200 |
captions_model=captions_model,
|
1201 |
enable_ocr=enable_ocr,
|
|
|
1202 |
caption_loader=caption_loader)
|
1203 |
except BaseException as e:
|
1204 |
print("Failed to ingest %s due to %s" % (file, traceback.format_exc()))
|
@@ -1207,7 +1371,7 @@ def path_to_doc1(file, verbose=False, fail_any_exception=False, return_file=True
|
|
1207 |
else:
|
1208 |
exception_doc = Document(
|
1209 |
page_content='',
|
1210 |
-
metadata={"source": file, "exception": '%s
|
1211 |
"traceback": traceback.format_exc()})
|
1212 |
res = [exception_doc]
|
1213 |
if return_file:
|
@@ -1228,6 +1392,7 @@ def path_to_docs(path_or_paths, verbose=False, fail_any_exception=False, n_jobs=
|
|
1228 |
captions_model=None,
|
1229 |
caption_loader=None,
|
1230 |
enable_ocr=False,
|
|
|
1231 |
existing_files=[],
|
1232 |
existing_hash_ids={},
|
1233 |
):
|
@@ -1249,11 +1414,15 @@ def path_to_docs(path_or_paths, verbose=False, fail_any_exception=False, n_jobs=
|
|
1249 |
[globs_non_image_types.extend(glob.glob(os.path.join(path, "./**/*.%s" % ftype), recursive=True))
|
1250 |
for ftype in non_image_types]
|
1251 |
else:
|
1252 |
-
if isinstance(path_or_paths, str)
|
1253 |
-
path_or_paths
|
|
|
|
|
|
|
|
|
1254 |
# list/tuple of files (consume what can, and exception those that selected but cannot consume so user knows)
|
1255 |
-
assert isinstance(path_or_paths, (list, tuple, types.GeneratorType)),
|
1256 |
-
path_or_paths)
|
1257 |
# reform out of allowed types
|
1258 |
globs_image_types.extend(flatten_list([[x for x in path_or_paths if x.endswith(y)] for y in image_types]))
|
1259 |
# could do below:
|
@@ -1305,6 +1474,7 @@ def path_to_docs(path_or_paths, verbose=False, fail_any_exception=False, n_jobs=
|
|
1305 |
captions_model=captions_model,
|
1306 |
caption_loader=caption_loader,
|
1307 |
enable_ocr=enable_ocr,
|
|
|
1308 |
)
|
1309 |
|
1310 |
if n_jobs != 1 and len(globs_non_image_types) > 1:
|
@@ -1337,7 +1507,7 @@ def path_to_docs(path_or_paths, verbose=False, fail_any_exception=False, n_jobs=
|
|
1337 |
with open(fil, 'rb') as f:
|
1338 |
documents.extend(pickle.load(f))
|
1339 |
# remove temp pickle
|
1340 |
-
|
1341 |
else:
|
1342 |
documents = reduce(concat, documents)
|
1343 |
return documents
|
@@ -1345,7 +1515,7 @@ def path_to_docs(path_or_paths, verbose=False, fail_any_exception=False, n_jobs=
|
|
1345 |
|
1346 |
def prep_langchain(persist_directory,
|
1347 |
load_db_if_exists,
|
1348 |
-
db_type, use_openai_embedding, langchain_mode,
|
1349 |
hf_embedding_model, n_jobs=-1, kwargs_make_db={}):
|
1350 |
"""
|
1351 |
do prep first time, involving downloads
|
@@ -1355,6 +1525,7 @@ def prep_langchain(persist_directory,
|
|
1355 |
assert langchain_mode not in ['MyData'], "Should not prep scratch data"
|
1356 |
|
1357 |
db_dir_exists = os.path.isdir(persist_directory)
|
|
|
1358 |
|
1359 |
if db_dir_exists and user_path is None:
|
1360 |
print("Prep: persist_directory=%s exists, using" % persist_directory, flush=True)
|
@@ -1490,7 +1661,7 @@ def make_db(**langchain_kwargs):
|
|
1490 |
langchain_kwargs[k] = defaults_db[k]
|
1491 |
# final check for missing
|
1492 |
missing_kwargs = [x for x in func_names if x not in langchain_kwargs]
|
1493 |
-
assert not missing_kwargs, "Missing kwargs: %s" % missing_kwargs
|
1494 |
# only keep actual used
|
1495 |
langchain_kwargs = {k: v for k, v in langchain_kwargs.items() if k in func_names}
|
1496 |
return _make_db(**langchain_kwargs)
|
@@ -1524,13 +1695,14 @@ def _make_db(use_openai_embedding=False,
|
|
1524 |
first_para=False, text_limit=None,
|
1525 |
chunk=True, chunk_size=512,
|
1526 |
langchain_mode=None,
|
1527 |
-
|
1528 |
db_type='faiss',
|
1529 |
load_db_if_exists=True,
|
1530 |
db=None,
|
1531 |
n_jobs=-1,
|
1532 |
verbose=False):
|
1533 |
persist_directory = get_persist_directory(langchain_mode)
|
|
|
1534 |
# see if can get persistent chroma db
|
1535 |
db_trial = get_existing_db(db, persist_directory, load_db_if_exists, db_type, use_openai_embedding, langchain_mode,
|
1536 |
hf_embedding_model, verbose=verbose)
|
@@ -1538,23 +1710,8 @@ def _make_db(use_openai_embedding=False,
|
|
1538 |
db = db_trial
|
1539 |
|
1540 |
sources = []
|
1541 |
-
if not db
|
1542 |
-
|
1543 |
-
langchain_mode in ['UserData']:
|
1544 |
-
# Should not make MyData db this way, why avoided, only upload from UI
|
1545 |
-
assert langchain_mode not in ['MyData'], "Should not make MyData db this way"
|
1546 |
-
if verbose:
|
1547 |
-
if langchain_mode in ['UserData']:
|
1548 |
-
if user_path is not None:
|
1549 |
-
print("Checking if changed or new sources in %s, and generating sources them" % user_path,
|
1550 |
-
flush=True)
|
1551 |
-
elif db is None:
|
1552 |
-
print("user_path not passed and no db, no sources", flush=True)
|
1553 |
-
else:
|
1554 |
-
print("user_path not passed, using only existing db, no new sources", flush=True)
|
1555 |
-
else:
|
1556 |
-
print("Generating %s sources" % langchain_mode, flush=True)
|
1557 |
-
if langchain_mode in ['wiki_full', 'All', "'All'"]:
|
1558 |
from read_wiki_full import get_all_documents
|
1559 |
small_test = None
|
1560 |
print("Generating new wiki", flush=True)
|
@@ -1564,55 +1721,48 @@ def _make_db(use_openai_embedding=False,
|
|
1564 |
sources1 = chunk_sources(sources1, chunk=chunk, chunk_size=chunk_size)
|
1565 |
print("Chunked new wiki", flush=True)
|
1566 |
sources.extend(sources1)
|
1567 |
-
|
1568 |
sources1 = get_wiki_sources(first_para=first_para, text_limit=text_limit)
|
1569 |
if chunk:
|
1570 |
sources1 = chunk_sources(sources1, chunk=chunk, chunk_size=chunk_size)
|
1571 |
sources.extend(sources1)
|
1572 |
-
|
1573 |
# sources = get_github_docs("dagster-io", "dagster")
|
1574 |
sources1 = get_github_docs("h2oai", "h2ogpt")
|
1575 |
# FIXME: always chunk for now
|
1576 |
sources1 = chunk_sources(sources1, chunk=chunk, chunk_size=chunk_size)
|
1577 |
sources.extend(sources1)
|
1578 |
-
|
1579 |
sources1 = get_dai_docs(from_hf=True)
|
1580 |
if chunk and False: # FIXME: DAI docs are already chunked well, should only chunk more if over limit
|
1581 |
sources1 = chunk_sources(sources1, chunk=chunk, chunk_size=chunk_size)
|
1582 |
sources.extend(sources1)
|
1583 |
-
|
1584 |
-
|
1585 |
-
|
1586 |
-
|
1587 |
-
|
1588 |
-
|
1589 |
-
|
1590 |
-
|
1591 |
-
|
1592 |
-
|
1593 |
-
|
1594 |
-
|
1595 |
-
|
1596 |
-
|
1597 |
-
|
1598 |
-
|
1599 |
-
|
1600 |
-
|
1601 |
-
|
1602 |
-
|
1603 |
-
|
1604 |
-
|
1605 |
-
|
1606 |
-
|
1607 |
-
|
1608 |
-
|
1609 |
-
# from langchain.document_loaders import UnstructuredURLLoader
|
1610 |
-
# loader = UnstructuredURLLoader(urls=urls)
|
1611 |
-
urls = ["https://www.birdsongsf.com/who-we-are/"]
|
1612 |
-
from langchain.document_loaders import PlaywrightURLLoader
|
1613 |
-
loader = PlaywrightURLLoader(urls=urls, remove_selectors=["header", "footer"])
|
1614 |
-
sources1 = loader.load()
|
1615 |
-
sources.extend(sources1)
|
1616 |
if not sources:
|
1617 |
if verbose:
|
1618 |
if db is not None:
|
@@ -1635,7 +1785,7 @@ def _make_db(use_openai_embedding=False,
|
|
1635 |
else:
|
1636 |
print("Did not generate db since no sources", flush=True)
|
1637 |
new_sources_metadata = [x.metadata for x in sources]
|
1638 |
-
elif user_path is not None
|
1639 |
print("Existing db, potentially adding %s sources from user_path=%s" % (len(sources), user_path), flush=True)
|
1640 |
db, num_new_sources, new_sources_metadata = add_to_db(db, sources, db_type=db_type,
|
1641 |
use_openai_embedding=use_openai_embedding,
|
@@ -1733,7 +1883,7 @@ def run_qa_db(**kwargs):
|
|
1733 |
kwargs['answer_with_sources'] = True
|
1734 |
kwargs['show_rank'] = False
|
1735 |
missing_kwargs = [x for x in func_names if x not in kwargs]
|
1736 |
-
assert not missing_kwargs, "Missing kwargs: %s" % missing_kwargs
|
1737 |
# only keep actual used
|
1738 |
kwargs = {k: v for k, v in kwargs.items() if k in func_names}
|
1739 |
try:
|
@@ -1747,7 +1897,7 @@ def _run_qa_db(query=None,
|
|
1747 |
context=None,
|
1748 |
use_openai_model=False, use_openai_embedding=False,
|
1749 |
first_para=False, text_limit=None, top_k_docs=4, chunk=True, chunk_size=512,
|
1750 |
-
|
1751 |
detect_user_path_changes_every_query=False,
|
1752 |
db_type='faiss',
|
1753 |
model_name=None, model=None, tokenizer=None, inference_server=None,
|
@@ -1757,9 +1907,11 @@ def _run_qa_db(query=None,
|
|
1757 |
prompt_type=None,
|
1758 |
prompt_dict=None,
|
1759 |
answer_with_sources=True,
|
1760 |
-
|
|
|
1761 |
sanitize_bot_response=False,
|
1762 |
show_rank=False,
|
|
|
1763 |
load_db_if_exists=False,
|
1764 |
db=None,
|
1765 |
do_sample=False,
|
@@ -1775,8 +1927,9 @@ def _run_qa_db(query=None,
|
|
1775 |
num_return_sequences=1,
|
1776 |
langchain_mode=None,
|
1777 |
langchain_action=None,
|
1778 |
-
|
1779 |
-
|
|
|
1780 |
n_jobs=-1,
|
1781 |
verbose=False,
|
1782 |
cli=False,
|
@@ -1795,7 +1948,7 @@ def _run_qa_db(query=None,
|
|
1795 |
:param top_k_docs:
|
1796 |
:param chunk:
|
1797 |
:param chunk_size:
|
1798 |
-
:param
|
1799 |
:param db_type: 'faiss' for in-memory db or 'chroma' or 'weaviate' for persistent db
|
1800 |
:param model_name: model name, used to switch behaviors
|
1801 |
:param model: pre-initialized model, else will make new one
|
@@ -1803,6 +1956,7 @@ def _run_qa_db(query=None,
|
|
1803 |
:param answer_with_sources
|
1804 |
:return:
|
1805 |
"""
|
|
|
1806 |
if model is not None:
|
1807 |
assert model_name is not None # require so can make decisions
|
1808 |
assert query is not None
|
@@ -1817,6 +1971,8 @@ def _run_qa_db(query=None,
|
|
1817 |
else:
|
1818 |
prompt_dict = ''
|
1819 |
assert len(set(gen_hyper).difference(inspect.signature(get_llm).parameters)) == 0
|
|
|
|
|
1820 |
llm, model_name, streamer, prompt_type_out = get_llm(use_openai_model=use_openai_model, model_name=model_name,
|
1821 |
model=model,
|
1822 |
tokenizer=tokenizer,
|
@@ -1836,11 +1992,13 @@ def _run_qa_db(query=None,
|
|
1836 |
prompt_type=prompt_type,
|
1837 |
prompt_dict=prompt_dict,
|
1838 |
prompter=prompter,
|
|
|
|
|
1839 |
sanitize_bot_response=sanitize_bot_response,
|
1840 |
verbose=verbose,
|
1841 |
)
|
1842 |
|
1843 |
-
|
1844 |
scores = []
|
1845 |
chain = None
|
1846 |
|
@@ -1852,25 +2010,29 @@ def _run_qa_db(query=None,
|
|
1852 |
sim_kwargs = {k: v for k, v in locals().items() if k in func_names}
|
1853 |
missing_kwargs = [x for x in func_names if x not in sim_kwargs]
|
1854 |
assert not missing_kwargs, "Missing: %s" % missing_kwargs
|
1855 |
-
docs, chain, scores,
|
1856 |
if document_subset in non_query_commands:
|
1857 |
formatted_doc_chunks = '\n\n'.join([get_url(x) + '\n\n' + x.page_content for x in docs])
|
|
|
|
|
|
|
|
|
1858 |
yield formatted_doc_chunks, ''
|
1859 |
return
|
1860 |
-
if not
|
1861 |
-
|
1862 |
-
|
1863 |
-
|
1864 |
-
|
1865 |
-
|
1866 |
-
|
1867 |
-
|
1868 |
-
|
1869 |
-
|
1870 |
-
|
1871 |
-
|
1872 |
-
|
1873 |
-
|
1874 |
|
1875 |
if chain is None and model_name not in non_hf_types:
|
1876 |
# here if no docs at all and not HF type
|
@@ -1921,7 +2083,7 @@ def _run_qa_db(query=None,
|
|
1921 |
else:
|
1922 |
answer = chain()
|
1923 |
|
1924 |
-
if not
|
1925 |
ret = answer['output_text']
|
1926 |
extra = ''
|
1927 |
yield ret, extra
|
@@ -1933,9 +2095,10 @@ def _run_qa_db(query=None,
|
|
1933 |
|
1934 |
def get_chain(query=None,
|
1935 |
iinput=None,
|
|
|
1936 |
use_openai_model=False, use_openai_embedding=False,
|
1937 |
first_para=False, text_limit=None, top_k_docs=4, chunk=True, chunk_size=512,
|
1938 |
-
|
1939 |
detect_user_path_changes_every_query=False,
|
1940 |
db_type='faiss',
|
1941 |
model_name=None,
|
@@ -1943,13 +2106,15 @@ def get_chain(query=None,
|
|
1943 |
hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
|
1944 |
prompt_type=None,
|
1945 |
prompt_dict=None,
|
1946 |
-
|
|
|
1947 |
load_db_if_exists=False,
|
1948 |
db=None,
|
1949 |
langchain_mode=None,
|
1950 |
langchain_action=None,
|
1951 |
-
|
1952 |
-
|
|
|
1953 |
n_jobs=-1,
|
1954 |
# beyond run_db_query:
|
1955 |
llm=None,
|
@@ -1961,14 +2126,15 @@ def get_chain(query=None,
|
|
1961 |
auto_reduce_chunks=True,
|
1962 |
max_chunks=100,
|
1963 |
):
|
|
|
1964 |
# determine whether use of context out of docs is planned
|
1965 |
if not use_openai_model and prompt_type not in ['plain'] or model_name in non_hf_types:
|
1966 |
-
if langchain_mode in ['Disabled', '
|
1967 |
-
|
1968 |
else:
|
1969 |
-
|
1970 |
else:
|
1971 |
-
|
1972 |
|
1973 |
# https://github.com/hwchase17/langchain/issues/1946
|
1974 |
# FIXME: Seems to way to get size of chroma db to limit top_k_docs to avoid
|
@@ -1985,14 +2151,17 @@ def get_chain(query=None,
|
|
1985 |
# avoid looking at user_path during similarity search db handling,
|
1986 |
# if already have db and not updating from user_path every query
|
1987 |
# but if db is None, no db yet loaded (e.g. from prep), so allow user_path to be whatever it was
|
1988 |
-
|
|
|
|
|
|
|
1989 |
db, num_new_sources, new_sources_metadata = make_db(use_openai_embedding=use_openai_embedding,
|
1990 |
hf_embedding_model=hf_embedding_model,
|
1991 |
first_para=first_para, text_limit=text_limit,
|
1992 |
chunk=chunk,
|
1993 |
chunk_size=chunk_size,
|
1994 |
langchain_mode=langchain_mode,
|
1995 |
-
|
1996 |
db_type=db_type,
|
1997 |
load_db_if_exists=load_db_if_exists,
|
1998 |
db=db,
|
@@ -2012,7 +2181,7 @@ def get_chain(query=None,
|
|
2012 |
else:
|
2013 |
extra = ""
|
2014 |
prefix = ""
|
2015 |
-
if langchain_mode in ['Disabled', '
|
2016 |
template_if_no_docs = template = """%s{context}{question}""" % prefix
|
2017 |
else:
|
2018 |
template = """%s
|
@@ -2053,7 +2222,7 @@ def get_chain(query=None,
|
|
2053 |
else:
|
2054 |
use_template = False
|
2055 |
|
2056 |
-
if db and
|
2057 |
base_path = 'locks'
|
2058 |
makedirs(base_path)
|
2059 |
if hasattr(db, '_persist_directory'):
|
@@ -2067,10 +2236,10 @@ def get_chain(query=None,
|
|
2067 |
filter_kwargs = {}
|
2068 |
else:
|
2069 |
assert document_choice is not None, "Document choice was None"
|
2070 |
-
if len(document_choice) >= 1 and document_choice[0] ==
|
2071 |
filter_kwargs = {}
|
2072 |
elif len(document_choice) >= 2:
|
2073 |
-
if document_choice[0] ==
|
2074 |
# remove 'All'
|
2075 |
document_choice = document_choice[1:]
|
2076 |
or_filter = [{"source": {"$eq": x}} for x in document_choice]
|
@@ -2082,18 +2251,18 @@ def get_chain(query=None,
|
|
2082 |
else:
|
2083 |
# shouldn't reach
|
2084 |
filter_kwargs = {}
|
2085 |
-
if langchain_mode in [LangChainMode.LLM.value
|
2086 |
docs = []
|
2087 |
scores = []
|
2088 |
-
elif document_subset ==
|
2089 |
db_documents, db_metadatas = get_docs_and_meta(db, top_k_docs, filter_kwargs=filter_kwargs)
|
2090 |
# similar to langchain's chroma's _results_to_docs_and_scores
|
2091 |
docs_with_score = [(Document(page_content=result[0], metadata=result[1] or {}), 0)
|
2092 |
for result in zip(db_documents, db_metadatas)]
|
2093 |
|
2094 |
# order documents
|
2095 |
-
doc_hashes = [x
|
2096 |
-
doc_chunk_ids = [x
|
2097 |
docs_with_score = [x for _, _, x in
|
2098 |
sorted(zip(doc_hashes, doc_chunk_ids, docs_with_score), key=lambda x: (x[0], x[1]))
|
2099 |
]
|
@@ -2173,8 +2342,8 @@ def get_chain(query=None,
|
|
2173 |
docs_with_score.reverse()
|
2174 |
# cut off so no high distance docs/sources considered
|
2175 |
have_any_docs |= len(docs_with_score) > 0 # before cut
|
2176 |
-
docs = [x[0] for x in docs_with_score if x[1] <
|
2177 |
-
scores = [x[1] for x in docs_with_score if x[1] <
|
2178 |
if len(scores) > 0 and verbose:
|
2179 |
print("Distance: min: %s max: %s mean: %s median: %s" %
|
2180 |
(scores[0], scores[-1], np.mean(scores), np.median(scores)), flush=True)
|
@@ -2182,7 +2351,7 @@ def get_chain(query=None,
|
|
2182 |
docs = []
|
2183 |
scores = []
|
2184 |
|
2185 |
-
if not docs and
|
2186 |
# if HF type and have no docs, can bail out
|
2187 |
return docs, None, [], False, have_any_docs
|
2188 |
|
@@ -2205,7 +2374,7 @@ def get_chain(query=None,
|
|
2205 |
|
2206 |
if len(docs) == 0:
|
2207 |
# avoid context == in prompt then
|
2208 |
-
|
2209 |
template = template_if_no_docs
|
2210 |
|
2211 |
if langchain_action == LangChainAction.QUERY.value:
|
@@ -2221,7 +2390,7 @@ def get_chain(query=None,
|
|
2221 |
else:
|
2222 |
# only if use_openai_model = True, unused normally except in testing
|
2223 |
chain = load_qa_with_sources_chain(llm)
|
2224 |
-
if not
|
2225 |
chain_kwargs = dict(input_documents=[], question=query)
|
2226 |
else:
|
2227 |
chain_kwargs = dict(input_documents=docs, question=query)
|
@@ -2248,7 +2417,7 @@ def get_chain(query=None,
|
|
2248 |
else:
|
2249 |
raise RuntimeError("No such langchain_action=%s" % langchain_action)
|
2250 |
|
2251 |
-
return docs, target, scores,
|
2252 |
|
2253 |
|
2254 |
def get_sources_answer(query, answer, scores, show_rank, answer_with_sources, verbose=False):
|
@@ -2302,6 +2471,7 @@ def clean_doc(docs1):
|
|
2302 |
|
2303 |
def chunk_sources(sources, chunk=True, chunk_size=512, language=None):
|
2304 |
if not chunk:
|
|
|
2305 |
return sources
|
2306 |
if not isinstance(sources, (list, tuple, types.GeneratorType)) and not callable(sources):
|
2307 |
# if just one document
|
@@ -2320,8 +2490,7 @@ def chunk_sources(sources, chunk=True, chunk_size=512, language=None):
|
|
2320 |
source_chunks = splitter.split_documents(sources)
|
2321 |
|
2322 |
# currently in order, but when pull from db won't be, so mark order and document by hash
|
2323 |
-
|
2324 |
-
[x.metadata.update(dict(doc_hash=doc_hash, chunk_id=chunk_id)) for chunk_id, x in enumerate(source_chunks)]
|
2325 |
|
2326 |
return source_chunks
|
2327 |
|
|
|
21 |
from joblib import delayed
|
22 |
from langchain.callbacks import streaming_stdout
|
23 |
from langchain.embeddings import HuggingFaceInstructEmbeddings
|
24 |
+
from langchain.schema import LLMResult
|
25 |
from tqdm import tqdm
|
26 |
|
27 |
+
from enums import DocumentSubset, no_lora_str, model_token_mapping, source_prefix, source_postfix, non_query_commands, \
|
28 |
+
LangChainAction, LangChainMode, DocumentChoice
|
29 |
from evaluate_params import gen_hyper
|
30 |
from gen import get_model, SEED
|
31 |
from prompter import non_hf_types, PromptType, Prompter
|
32 |
from utils import wrapped_partial, EThread, import_matplotlib, sanitize_filename, makedirs, get_url, flatten_list, \
|
33 |
get_device, ProgressParallel, remove, hash_file, clear_torch_cache, NullContext, get_hf_server, FakeTokenizer, \
|
34 |
+
have_libreoffice, have_arxiv, have_playwright, have_selenium, have_tesseract, have_pymupdf, set_openai
|
35 |
from utils_langchain import StreamingGradioCallbackHandler
|
36 |
|
37 |
import_matplotlib()
|
|
|
96 |
db = get_existing_db(None, persist_directory, load_db_if_exists, db_type, use_openai_embedding, langchain_mode,
|
97 |
hf_embedding_model, verbose=False)
|
98 |
if db is None:
|
99 |
+
from chromadb.config import Settings
|
100 |
+
client_settings = Settings(anonymized_telemetry=False,
|
101 |
+
chroma_db_impl="duckdb+parquet",
|
102 |
+
persist_directory=persist_directory)
|
103 |
db = Chroma.from_documents(documents=sources,
|
104 |
embedding=embedding,
|
105 |
persist_directory=persist_directory,
|
106 |
collection_name=collection_name,
|
107 |
+
client_settings=client_settings)
|
108 |
db.persist()
|
109 |
clear_embedding(db)
|
110 |
save_embed(db, use_openai_embedding, hf_embedding_model)
|
|
|
281 |
|
282 |
from pydantic import Extra, Field, root_validator
|
283 |
|
284 |
+
from langchain.callbacks.manager import CallbackManagerForLLMRun, Callbacks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
from langchain.llms.base import LLM
|
286 |
|
287 |
|
|
|
309 |
sanitize_bot_response: bool = False
|
310 |
|
311 |
prompter: Any = None
|
312 |
+
context: Any = ''
|
313 |
+
iinput: Any = ''
|
314 |
client: Any = None
|
315 |
|
316 |
class Config:
|
|
|
354 |
stream_output = self.stream
|
355 |
gr_client = self.client
|
356 |
client_langchain_mode = 'Disabled'
|
357 |
+
client_add_chat_history_to_context = True
|
358 |
client_langchain_action = LangChainAction.QUERY.value
|
359 |
+
client_langchain_agents = []
|
360 |
top_k_docs = 1
|
361 |
chunk = True
|
362 |
chunk_size = 512
|
363 |
client_kwargs = dict(instruction=prompt if self.chat_client else '', # only for chat=True
|
364 |
+
iinput=self.iinput if self.chat_client else '', # only for chat=True
|
365 |
+
context=self.context,
|
366 |
# streaming output is supported, loops over and outputs each generation in streaming mode
|
367 |
# but leave stream_output=False for simple input/output mode
|
368 |
stream_output=stream_output,
|
|
|
383 |
chat=self.chat_client,
|
384 |
|
385 |
instruction_nochat=prompt if not self.chat_client else '',
|
386 |
+
iinput_nochat=self.iinput if not self.chat_client else '',
|
387 |
langchain_mode=client_langchain_mode,
|
388 |
+
add_chat_history_to_context=client_add_chat_history_to_context,
|
389 |
langchain_action=client_langchain_action,
|
390 |
+
langchain_agents=client_langchain_agents,
|
391 |
top_k_docs=top_k_docs,
|
392 |
chunk=chunk,
|
393 |
chunk_size=chunk_size,
|
394 |
+
document_subset=DocumentSubset.Relevant.name,
|
395 |
+
document_choice=[DocumentChoice.ALL.value],
|
396 |
)
|
397 |
api_name = '/submit_nochat_api' # NOTE: like submit_nochat but stable API for string dict passing
|
398 |
if not stream_output:
|
|
|
462 |
stream: bool = False
|
463 |
sanitize_bot_response: bool = False
|
464 |
prompter: Any = None
|
465 |
+
context: Any = ''
|
466 |
+
iinput: Any = ''
|
467 |
tokenizer: Any = None
|
468 |
client: Any = None
|
469 |
|
|
|
505 |
prompt, num_prompt_tokens = H2OTextGenerationPipeline.limit_prompt(prompt, self.tokenizer)
|
506 |
|
507 |
# NOTE: TGI server does not add prompting, so must do here
|
508 |
+
data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
|
509 |
prompt = self.prompter.generate_prompt(data_point)
|
510 |
|
511 |
gen_server_kwargs = dict(do_sample=self.do_sample,
|
|
|
571 |
|
572 |
|
573 |
from langchain.chat_models import ChatOpenAI
|
574 |
+
from langchain.llms import OpenAI
|
575 |
+
from langchain.llms.openai import _streaming_response_template, completion_with_retry, _update_response, \
|
576 |
+
update_token_usage
|
577 |
+
|
578 |
+
|
579 |
+
class H2OOpenAI(OpenAI):
|
580 |
+
"""
|
581 |
+
New class to handle vLLM's use of OpenAI, no vllm_chat supported, so only need here
|
582 |
+
Handles prompting that OpenAI doesn't need, stopping as well
|
583 |
+
"""
|
584 |
+
stop_sequences: Any = None
|
585 |
+
sanitize_bot_response: bool = False
|
586 |
+
prompter: Any = None
|
587 |
+
context: Any = ''
|
588 |
+
iinput: Any = ''
|
589 |
+
tokenizer: Any = None
|
590 |
+
|
591 |
+
@classmethod
|
592 |
+
def all_required_field_names(cls) -> Set:
|
593 |
+
all_required_field_names = super(OpenAI, cls).all_required_field_names()
|
594 |
+
all_required_field_names.update(
|
595 |
+
{'top_p', 'frequency_penalty', 'presence_penalty', 'stop_sequences', 'sanitize_bot_response', 'prompter',
|
596 |
+
'tokenizer'})
|
597 |
+
return all_required_field_names
|
598 |
+
|
599 |
+
def _generate(
|
600 |
+
self,
|
601 |
+
prompts: List[str],
|
602 |
+
stop: Optional[List[str]] = None,
|
603 |
+
run_manager: Optional[CallbackManagerForLLMRun] = None,
|
604 |
+
**kwargs: Any,
|
605 |
+
) -> LLMResult:
|
606 |
+
stop = self.stop_sequences if not stop else self.stop_sequences + stop
|
607 |
+
|
608 |
+
# HF inference server needs control over input tokens
|
609 |
+
assert self.tokenizer is not None
|
610 |
+
from h2oai_pipeline import H2OTextGenerationPipeline
|
611 |
+
for prompti, prompt in enumerate(prompts):
|
612 |
+
prompt, num_prompt_tokens = H2OTextGenerationPipeline.limit_prompt(prompt, self.tokenizer)
|
613 |
+
# NOTE: OpenAI/vLLM server does not add prompting, so must do here
|
614 |
+
data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
|
615 |
+
prompt = self.prompter.generate_prompt(data_point)
|
616 |
+
prompts[prompti] = prompt
|
617 |
+
|
618 |
+
params = self._invocation_params
|
619 |
+
params = {**params, **kwargs}
|
620 |
+
sub_prompts = self.get_sub_prompts(params, prompts, stop)
|
621 |
+
choices = []
|
622 |
+
token_usage: Dict[str, int] = {}
|
623 |
+
# Get the token usage from the response.
|
624 |
+
# Includes prompt, completion, and total tokens used.
|
625 |
+
_keys = {"completion_tokens", "prompt_tokens", "total_tokens"}
|
626 |
+
text = ''
|
627 |
+
for _prompts in sub_prompts:
|
628 |
+
if self.streaming:
|
629 |
+
text_with_prompt = ""
|
630 |
+
prompt = _prompts[0]
|
631 |
+
if len(_prompts) > 1:
|
632 |
+
raise ValueError("Cannot stream results with multiple prompts.")
|
633 |
+
params["stream"] = True
|
634 |
+
response = _streaming_response_template()
|
635 |
+
first = True
|
636 |
+
for stream_resp in completion_with_retry(
|
637 |
+
self, prompt=_prompts, **params
|
638 |
+
):
|
639 |
+
if first:
|
640 |
+
stream_resp["choices"][0]["text"] = prompt + stream_resp["choices"][0]["text"]
|
641 |
+
first = False
|
642 |
+
text_chunk = stream_resp["choices"][0]["text"]
|
643 |
+
text_with_prompt += text_chunk
|
644 |
+
text = self.prompter.get_response(text_with_prompt, prompt=prompt,
|
645 |
+
sanitize_bot_response=self.sanitize_bot_response)
|
646 |
+
if run_manager:
|
647 |
+
run_manager.on_llm_new_token(
|
648 |
+
text_chunk,
|
649 |
+
verbose=self.verbose,
|
650 |
+
logprobs=stream_resp["choices"][0]["logprobs"],
|
651 |
+
)
|
652 |
+
_update_response(response, stream_resp)
|
653 |
+
choices.extend(response["choices"])
|
654 |
+
else:
|
655 |
+
response = completion_with_retry(self, prompt=_prompts, **params)
|
656 |
+
choices.extend(response["choices"])
|
657 |
+
if not self.streaming:
|
658 |
+
# Can't update token usage if streaming
|
659 |
+
update_token_usage(_keys, response, token_usage)
|
660 |
+
choices[0]['text'] = text
|
661 |
+
return self.create_llm_result(choices, prompts, token_usage)
|
662 |
|
663 |
|
664 |
class H2OChatOpenAI(ChatOpenAI):
|
|
|
689 |
prompt_type=None,
|
690 |
prompt_dict=None,
|
691 |
prompter=None,
|
692 |
+
context=None,
|
693 |
+
iinput=None,
|
694 |
sanitize_bot_response=False,
|
695 |
verbose=False,
|
696 |
):
|
697 |
+
if inference_server is None:
|
698 |
+
inference_server = ''
|
699 |
+
if use_openai_model or inference_server.startswith('openai') or inference_server.startswith('vllm'):
|
700 |
if use_openai_model and model_name is None:
|
701 |
model_name = "gpt-3.5-turbo"
|
702 |
+
# FIXME: Will later import be ignored? I think so, so should be fine
|
703 |
+
openai, inf_type = set_openai(inference_server)
|
704 |
+
kwargs_extra = {}
|
705 |
+
if inference_server == 'openai_chat' or inf_type == 'vllm_chat':
|
706 |
cls = H2OChatOpenAI
|
707 |
+
# FIXME: Support context, iinput
|
708 |
+
else:
|
709 |
+
cls = H2OOpenAI
|
710 |
+
if inf_type == 'vllm':
|
711 |
+
terminate_response = prompter.terminate_response or []
|
712 |
+
stop_sequences = list(set(terminate_response + [prompter.PreResponse]))
|
713 |
+
stop_sequences = [x for x in stop_sequences if x]
|
714 |
+
kwargs_extra = dict(stop_sequences=stop_sequences,
|
715 |
+
sanitize_bot_response=sanitize_bot_response,
|
716 |
+
prompter=prompter,
|
717 |
+
context=context,
|
718 |
+
iinput=iinput,
|
719 |
+
tokenizer=tokenizer,
|
720 |
+
client=None)
|
721 |
+
|
722 |
callbacks = [StreamingGradioCallbackHandler()]
|
723 |
llm = cls(model_name=model_name,
|
724 |
temperature=temperature if do_sample else 0,
|
|
|
728 |
frequency_penalty=0,
|
729 |
presence_penalty=1.07 - repetition_penalty + 0.6, # so good default
|
730 |
callbacks=callbacks if stream_output else None,
|
731 |
+
openai_api_key=openai.api_key,
|
732 |
+
openai_api_base=openai.api_base,
|
733 |
+
logit_bias=None if inf_type == 'vllm' else {},
|
734 |
+
max_retries=2,
|
735 |
+
streaming=stream_output,
|
736 |
+
**kwargs_extra
|
737 |
)
|
738 |
streamer = callbacks[0] if stream_output else None
|
739 |
if inference_server in ['openai', 'openai_chat']:
|
740 |
prompt_type = inference_server
|
741 |
else:
|
742 |
+
# vllm goes here
|
743 |
prompt_type = prompt_type or 'plain'
|
744 |
elif inference_server:
|
745 |
assert inference_server.startswith(
|
|
|
788 |
callbacks=callbacks if stream_output else None,
|
789 |
stream=stream_output,
|
790 |
prompter=prompter,
|
791 |
+
context=context,
|
792 |
+
iinput=iinput,
|
793 |
client=gr_client,
|
794 |
sanitize_bot_response=sanitize_bot_response,
|
795 |
)
|
|
|
810 |
callbacks=callbacks if stream_output else None,
|
811 |
stream=stream_output,
|
812 |
prompter=prompter,
|
813 |
+
context=context,
|
814 |
+
iinput=iinput,
|
815 |
tokenizer=tokenizer,
|
816 |
client=hf_client,
|
817 |
timeout=max_time,
|
|
|
844 |
verbose=verbose,
|
845 |
streaming=stream_output,
|
846 |
prompter=prompter,
|
847 |
+
context=context,
|
848 |
+
iinput=iinput,
|
849 |
)
|
850 |
else:
|
851 |
if model is None:
|
|
|
888 |
from h2oai_pipeline import H2OTextGenerationPipeline
|
889 |
pipe = H2OTextGenerationPipeline(model=model, use_prompter=True,
|
890 |
prompter=prompter,
|
891 |
+
context=context,
|
892 |
+
iinpout=iinput,
|
893 |
prompt_type=prompt_type,
|
894 |
prompt_dict=prompt_dict,
|
895 |
sanitize_bot_response=sanitize_bot_response,
|
|
|
1043 |
return sources
|
1044 |
|
1045 |
|
|
|
1046 |
image_types = ["png", "jpg", "jpeg"]
|
1047 |
non_image_types = ["pdf", "txt", "csv", "toml", "py", "rst", "rtf",
|
1048 |
"md",
|
|
|
1053 |
]
|
1054 |
# "msg", GPL3
|
1055 |
|
1056 |
+
if have_libreoffice or True:
|
1057 |
+
# or True so it tries to load, e.g. on MAC/Windows, even if don't have libreoffice since works without that
|
1058 |
non_image_types.extend(["docx", "doc", "xls", "xlsx"])
|
1059 |
|
1060 |
file_types = non_image_types + image_types
|
|
|
1063 |
def add_meta(docs1, file):
|
1064 |
file_extension = pathlib.Path(file).suffix
|
1065 |
hashid = hash_file(file)
|
1066 |
+
doc_hash = str(uuid.uuid4())[:10]
|
1067 |
if not isinstance(docs1, (list, tuple, types.GeneratorType)):
|
1068 |
docs1 = [docs1]
|
1069 |
+
[x.metadata.update(dict(input_type=file_extension, date=str(datetime.now()), hashid=hashid, doc_hash=doc_hash)) for
|
1070 |
+
x in docs1]
|
1071 |
|
1072 |
|
1073 |
def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False,
|
|
|
1075 |
is_url=False, is_txt=False,
|
1076 |
enable_captions=True,
|
1077 |
captions_model=None,
|
1078 |
+
enable_ocr=False, enable_pdf_ocr='auto', caption_loader=None,
|
1079 |
headsize=50):
|
1080 |
if file is None:
|
1081 |
if fail_any_exception:
|
|
|
1092 |
base_name = sanitize_filename(base_name) + "_" + str(uuid.uuid4())[:10]
|
1093 |
base_path = os.path.join(dir_name, base_name)
|
1094 |
if is_url:
|
1095 |
+
file = file.strip() # in case accidental spaces in front or at end
|
1096 |
if file.lower().startswith('arxiv:'):
|
1097 |
query = file.lower().split('arxiv:')
|
1098 |
if len(query) == 2 and have_arxiv:
|
|
|
1141 |
add_meta(docs1, file)
|
1142 |
docs1 = clean_doc(docs1)
|
1143 |
doc1 = chunk_sources(docs1, chunk=chunk, chunk_size=chunk_size, language=Language.HTML)
|
1144 |
+
elif (file.lower().endswith('.docx') or file.lower().endswith('.doc')) and (have_libreoffice or True):
|
1145 |
docs1 = UnstructuredWordDocumentLoader(file_path=file).load()
|
1146 |
add_meta(docs1, file)
|
1147 |
doc1 = chunk_sources(docs1, chunk=chunk, chunk_size=chunk_size)
|
1148 |
+
elif (file.lower().endswith('.xlsx') or file.lower().endswith('.xls')) and (have_libreoffice or True):
|
1149 |
docs1 = UnstructuredExcelLoader(file_path=file).load()
|
1150 |
add_meta(docs1, file)
|
1151 |
doc1 = chunk_sources(docs1, chunk=chunk, chunk_size=chunk_size)
|
|
|
        from dotenv import dotenv_values
        env_kwargs = dotenv_values(env_gpt4all_file)
        pdf_class_name = env_kwargs.get('PDF_CLASS_NAME', 'PyMuPDFParser')
        doc1 = []
        handled = False
        if have_pymupdf and pdf_class_name == 'PyMuPDFParser':
            # GPL, only use if installed
            from langchain.document_loaders import PyMuPDFLoader
            # load() still chunks by pages, but every page has title at start to help
            doc1 = PyMuPDFLoader(file).load()
            # remove empty documents
            handled |= len(doc1) > 0
            doc1 = [x for x in doc1 if x.page_content]
            doc1 = clean_doc(doc1)
        if len(doc1) == 0:
            doc1 = UnstructuredPDFLoader(file).load()
            handled |= len(doc1) > 0
            # remove empty documents
            doc1 = [x for x in doc1 if x.page_content]
            # seems to not need cleaning in most cases
        if len(doc1) == 0:
            # open-source fallback
            # load() still chunks by pages, but every page has title at start to help
            doc1 = PyPDFLoader(file).load()
            handled |= len(doc1) > 0
            # remove empty documents
            doc1 = [x for x in doc1 if x.page_content]
            doc1 = clean_doc(doc1)
        if have_pymupdf and len(doc1) == 0:
            # GPL, only use if installed
            from langchain.document_loaders import PyMuPDFLoader
            # load() still chunks by pages, but every page has title at start to help
            doc1 = PyMuPDFLoader(file).load()
            handled |= len(doc1) > 0
            # remove empty documents
            doc1 = [x for x in doc1 if x.page_content]
            doc1 = clean_doc(doc1)
        if len(doc1) == 0 and enable_pdf_ocr == 'auto' or enable_pdf_ocr == 'on':
            # try OCR in end since slowest, but works on pure image pages well
            doc1 = UnstructuredPDFLoader(file, strategy='ocr_only').load()
            handled |= len(doc1) > 0
            # remove empty documents
            doc1 = [x for x in doc1 if x.page_content]
            # seems to not need cleaning in most cases
        # Some PDFs return nothing or junk from PDFMinerLoader
        if len(doc1) == 0:
            # if literally nothing, show failed to parse so user knows, since unlikely nothing in PDF at all.
            if handled:
                raise ValueError("%s had no valid text, but meta data was parsed" % file)
            else:
                raise ValueError("%s had no valid text and no meta data was parsed" % file)
        doc1 = chunk_sources(doc1, chunk=chunk, chunk_size=chunk_size)
        add_meta(doc1, file)
    elif file.lower().endswith('.csv'):
is_url=False, is_txt=False,
|
1345 |
enable_captions=True,
|
1346 |
captions_model=None,
|
1347 |
+
enable_ocr=False, enable_pdf_ocr='auto', caption_loader=None):
|
1348 |
if verbose:
|
1349 |
if is_url:
|
1350 |
print("Ingesting URL: %s" % file, flush=True)
|
|
|
1362 |
enable_captions=enable_captions,
|
1363 |
captions_model=captions_model,
|
1364 |
enable_ocr=enable_ocr,
|
1365 |
+
enable_pdf_ocr=enable_pdf_ocr,
|
1366 |
caption_loader=caption_loader)
|
1367 |
except BaseException as e:
|
1368 |
print("Failed to ingest %s due to %s" % (file, traceback.format_exc()))
|
|
|
1371 |
else:
|
1372 |
exception_doc = Document(
|
1373 |
page_content='',
|
1374 |
+
metadata={"source": file, "exception": '%s Exception: %s' % (file, str(e)),
|
1375 |
"traceback": traceback.format_exc()})
|
1376 |
res = [exception_doc]
|
1377 |
if return_file:
|
|
|
1392 |
captions_model=None,
|
1393 |
caption_loader=None,
|
1394 |
enable_ocr=False,
|
1395 |
+
enable_pdf_ocr='auto',
|
1396 |
existing_files=[],
|
1397 |
existing_hash_ids={},
|
1398 |
):
|
|
|
1414 |
[globs_non_image_types.extend(glob.glob(os.path.join(path, "./**/*.%s" % ftype), recursive=True))
|
1415 |
for ftype in non_image_types]
|
1416 |
else:
|
1417 |
+
if isinstance(path_or_paths, str):
|
1418 |
+
if os.path.isfile(path_or_paths) or os.path.isdir(path_or_paths):
|
1419 |
+
path_or_paths = [path_or_paths]
|
1420 |
+
else:
|
1421 |
+
# path was deleted etc.
|
1422 |
+
return []
|
1423 |
# list/tuple of files (consume what can, and exception those that selected but cannot consume so user knows)
|
1424 |
+
assert isinstance(path_or_paths, (list, tuple, types.GeneratorType)), \
|
1425 |
+
"Wrong type for path_or_paths: %s %s" % (path_or_paths, type(path_or_paths))
|
1426 |
# reform out of allowed types
|
1427 |
globs_image_types.extend(flatten_list([[x for x in path_or_paths if x.endswith(y)] for y in image_types]))
|
1428 |
# could do below:
|
|
|
1474 |
captions_model=captions_model,
|
1475 |
caption_loader=caption_loader,
|
1476 |
enable_ocr=enable_ocr,
|
1477 |
+
enable_pdf_ocr=enable_pdf_ocr,
|
1478 |
)
|
1479 |
|
1480 |
if n_jobs != 1 and len(globs_non_image_types) > 1:
|
|
|
            with open(fil, 'rb') as f:
                documents.extend(pickle.load(f))
            # remove temp pickle
            remove(fil)
    else:
        documents = reduce(concat, documents)
    return documents


def prep_langchain(persist_directory,
                   load_db_if_exists,
                   db_type, use_openai_embedding, langchain_mode, langchain_mode_paths,
                   hf_embedding_model, n_jobs=-1, kwargs_make_db={}):
    """
    do prep first time, involving downloads

    assert langchain_mode not in ['MyData'], "Should not prep scratch data"

    db_dir_exists = os.path.isdir(persist_directory)
    user_path = langchain_mode_paths.get(langchain_mode)

    if db_dir_exists and user_path is None:
        print("Prep: persist_directory=%s exists, using" % persist_directory, flush=True)

            langchain_kwargs[k] = defaults_db[k]
    # final check for missing
    missing_kwargs = [x for x in func_names if x not in langchain_kwargs]
    assert not missing_kwargs, "Missing kwargs for make_db: %s" % missing_kwargs
    # only keep actual used
    langchain_kwargs = {k: v for k, v in langchain_kwargs.items() if k in func_names}
    return _make_db(**langchain_kwargs)

             first_para=False, text_limit=None,
             chunk=True, chunk_size=512,
             langchain_mode=None,
             langchain_mode_paths=None,
             db_type='faiss',
             load_db_if_exists=True,
             db=None,
             n_jobs=-1,
             verbose=False):
    persist_directory = get_persist_directory(langchain_mode)
    user_path = langchain_mode_paths.get(langchain_mode)
    # see if can get persistent chroma db
    db_trial = get_existing_db(db, persist_directory, load_db_if_exists, db_type, use_openai_embedding, langchain_mode,
                               hf_embedding_model, verbose=verbose)

        db = db_trial

    sources = []
    if not db:
        if langchain_mode in ['wiki_full']:
            from read_wiki_full import get_all_documents
            small_test = None
            print("Generating new wiki", flush=True)

            sources1 = chunk_sources(sources1, chunk=chunk, chunk_size=chunk_size)
            print("Chunked new wiki", flush=True)
            sources.extend(sources1)
        elif langchain_mode in ['wiki']:
            sources1 = get_wiki_sources(first_para=first_para, text_limit=text_limit)
            if chunk:
                sources1 = chunk_sources(sources1, chunk=chunk, chunk_size=chunk_size)
            sources.extend(sources1)
        elif langchain_mode in ['github h2oGPT']:
            # sources = get_github_docs("dagster-io", "dagster")
            sources1 = get_github_docs("h2oai", "h2ogpt")
            # FIXME: always chunk for now
            sources1 = chunk_sources(sources1, chunk=chunk, chunk_size=chunk_size)
            sources.extend(sources1)
        elif langchain_mode in ['DriverlessAI docs']:
            sources1 = get_dai_docs(from_hf=True)
            if chunk and False:  # FIXME: DAI docs are already chunked well, should only chunk more if over limit
                sources1 = chunk_sources(sources1, chunk=chunk, chunk_size=chunk_size)
            sources.extend(sources1)
        if user_path:
            # UserData or custom, which has to be from user's disk
            if db is not None:
                # NOTE: Ignore file names for now, only go by hash ids
                # existing_files = get_existing_files(db)
                existing_files = []
                existing_hash_ids = get_existing_hash_ids(db)
            else:
                # pretend no existing files so won't filter
                existing_files = []
                existing_hash_ids = []
            # chunk internally for speed over multiple docs
            # FIXME: If first had old Hash=None and switch embeddings,
            # then re-embed, and then hit here and reload so have hash, and then re-embed.
            sources1 = path_to_docs(user_path, n_jobs=n_jobs, chunk=chunk, chunk_size=chunk_size,
                                    existing_files=existing_files, existing_hash_ids=existing_hash_ids)
            new_metadata_sources = set([x.metadata['source'] for x in sources1])
            if new_metadata_sources:
                print("Loaded %s new files as sources to add to %s" % (len(new_metadata_sources), langchain_mode),
                      flush=True)
                if verbose:
                    print("Files added: %s" % '\n'.join(new_metadata_sources), flush=True)
            sources.extend(sources1)
        print("Loaded %s sources for potentially adding to %s" % (len(sources), langchain_mode), flush=True)

    # see if got sources
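The langchain_mode_paths dict introduced by this commit maps a collection name to an optional on-disk path; only collections with a non-empty path get globbed for user files. A rough sketch of that lookup, not part of the commit, with made-up collection names and paths:

# Sketch: collection name -> user path; None means "no disk ingestion for this collection".
langchain_mode_paths = {
    'UserData': '/data/user_docs',   # hypothetical path
    'MyData': None,                  # scratch, per-user uploads only
}


def maybe_gather_sources(langchain_mode, gather_fn):
    user_path = langchain_mode_paths.get(langchain_mode)
    if user_path:
        return gather_fn(user_path)
    return []


print(maybe_gather_sources('MyData', lambda p: [p]))    # []
print(maybe_gather_sources('UserData', lambda p: [p]))  # ['/data/user_docs']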
    if not sources:
        if verbose:
            if db is not None:

        else:
            print("Did not generate db since no sources", flush=True)
        new_sources_metadata = [x.metadata for x in sources]
    elif user_path is not None:
        print("Existing db, potentially adding %s sources from user_path=%s" % (len(sources), user_path), flush=True)
        db, num_new_sources, new_sources_metadata = add_to_db(db, sources, db_type=db_type,
                                                              use_openai_embedding=use_openai_embedding,

    kwargs['answer_with_sources'] = True
    kwargs['show_rank'] = False
    missing_kwargs = [x for x in func_names if x not in kwargs]
    assert not missing_kwargs, "Missing kwargs for run_qa_db: %s" % missing_kwargs
    # only keep actual used
    kwargs = {k: v for k, v in kwargs.items() if k in func_names}
    try:
               context=None,
               use_openai_model=False, use_openai_embedding=False,
               first_para=False, text_limit=None, top_k_docs=4, chunk=True, chunk_size=512,
               langchain_mode_paths={},
               detect_user_path_changes_every_query=False,
               db_type='faiss',
               model_name=None, model=None, tokenizer=None, inference_server=None,

               prompt_type=None,
               prompt_dict=None,
               answer_with_sources=True,
               cut_distance=1.64,
               add_chat_history_to_context=True,
               sanitize_bot_response=False,
               show_rank=False,
               use_llm_if_no_docs=False,
               load_db_if_exists=False,
               db=None,
               do_sample=False,

               num_return_sequences=1,
               langchain_mode=None,
               langchain_action=None,
               langchain_agents=None,
               document_subset=DocumentSubset.Relevant.name,
               document_choice=[DocumentChoice.ALL.value],
               n_jobs=-1,
               verbose=False,
               cli=False,

    :param top_k_docs:
    :param chunk:
    :param chunk_size:
    :param langchain_mode_paths: dict of langchain_mode -> user path to glob recursively from
    :param db_type: 'faiss' for in-memory db or 'chroma' or 'weaviate' for persistent db
    :param model_name: model name, used to switch behaviors
    :param model: pre-initialized model, else will make new one

    :param answer_with_sources
    :return:
    """
    assert langchain_mode_paths is not None
    if model is not None:
        assert model_name is not None  # require so can make decisions
    assert query is not None
    else:
        prompt_dict = ''
    assert len(set(gen_hyper).difference(inspect.signature(get_llm).parameters)) == 0
    # pass in context to LLM directly, since already has prompt_type structure
    # can't pass through langchain in get_chain() to LLM: https://github.com/hwchase17/langchain/issues/6638
    llm, model_name, streamer, prompt_type_out = get_llm(use_openai_model=use_openai_model, model_name=model_name,
                                                         model=model,
                                                         tokenizer=tokenizer,

                                                         prompt_type=prompt_type,
                                                         prompt_dict=prompt_dict,
                                                         prompter=prompter,
                                                         context=context if add_chat_history_to_context else '',
                                                         iinput=iinput if add_chat_history_to_context else '',
                                                         sanitize_bot_response=sanitize_bot_response,
                                                         verbose=verbose,
                                                         )

    use_docs_planned = False
    scores = []
    chain = None
    sim_kwargs = {k: v for k, v in locals().items() if k in func_names}
    missing_kwargs = [x for x in func_names if x not in sim_kwargs]
    assert not missing_kwargs, "Missing: %s" % missing_kwargs
    docs, chain, scores, use_docs_planned, have_any_docs = get_chain(**sim_kwargs)
    if document_subset in non_query_commands:
        formatted_doc_chunks = '\n\n'.join([get_url(x) + '\n\n' + x.page_content for x in docs])
        if not formatted_doc_chunks and not use_llm_if_no_docs:
            yield "No sources", ''
            return
        # if no sources, outside gpt_langchain, LLM will be used with '' input
        yield formatted_doc_chunks, ''
        return
    if not use_llm_if_no_docs:
        if not docs and langchain_action in [LangChainAction.SUMMARIZE_MAP.value,
                                             LangChainAction.SUMMARIZE_ALL.value,
                                             LangChainAction.SUMMARIZE_REFINE.value]:
            ret = 'No relevant documents to summarize.' if have_any_docs else 'No documents to summarize.'
            extra = ''
            yield ret, extra
            return
        if not docs and langchain_mode not in [LangChainMode.DISABLED.value,
                                               LangChainMode.LLM.value]:
            ret = 'No relevant documents to query.' if have_any_docs else 'No documents to query.'
            extra = ''
            yield ret, extra
            return

    if chain is None and model_name not in non_hf_types:
        # here if no docs at all and not HF type
        else:
            answer = chain()

    if not use_docs_planned:
        ret = answer['output_text']
        extra = ''
        yield ret, extra


def get_chain(query=None,
              iinput=None,
              context=None,  # FIXME: https://github.com/hwchase17/langchain/issues/6638
              use_openai_model=False, use_openai_embedding=False,
              first_para=False, text_limit=None, top_k_docs=4, chunk=True, chunk_size=512,
              langchain_mode_paths=None,
              detect_user_path_changes_every_query=False,
              db_type='faiss',
              model_name=None,

              hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
              prompt_type=None,
              prompt_dict=None,
              cut_distance=1.1,
              add_chat_history_to_context=True,  # FIXME: https://github.com/hwchase17/langchain/issues/6638
              load_db_if_exists=False,
              db=None,
              langchain_mode=None,
              langchain_action=None,
              langchain_agents=None,
              document_subset=DocumentSubset.Relevant.name,
              document_choice=[DocumentChoice.ALL.value],
              n_jobs=-1,
              # beyond run_db_query:
              llm=None,
              auto_reduce_chunks=True,
              max_chunks=100,
              ):
    assert langchain_agents is not None  # should be at least []
    # determine whether use of context out of docs is planned
    if not use_openai_model and prompt_type not in ['plain'] or model_name in non_hf_types:
        if langchain_mode in ['Disabled', 'LLM']:
            use_docs_planned = False
        else:
            use_docs_planned = True
    else:
        use_docs_planned = True

    # https://github.com/hwchase17/langchain/issues/1946
    # FIXME: Seems to way to get size of chroma db to limit top_k_docs to avoid

            # avoid looking at user_path during similarity search db handling,
            # if already have db and not updating from user_path every query
            # but if db is None, no db yet loaded (e.g. from prep), so allow user_path to be whatever it was
            if langchain_mode_paths is None:
                langchain_mode_paths = {}
            langchain_mode_paths = langchain_mode_paths.copy()
            langchain_mode_paths[langchain_mode] = None
        db, num_new_sources, new_sources_metadata = make_db(use_openai_embedding=use_openai_embedding,
                                                            hf_embedding_model=hf_embedding_model,
                                                            first_para=first_para, text_limit=text_limit,
                                                            chunk=chunk,
                                                            chunk_size=chunk_size,
                                                            langchain_mode=langchain_mode,
                                                            langchain_mode_paths=langchain_mode_paths,
                                                            db_type=db_type,
                                                            load_db_if_exists=load_db_if_exists,
                                                            db=db,
    else:
        extra = ""
        prefix = ""
    if langchain_mode in ['Disabled', 'LLM'] or not use_docs_planned:
        template_if_no_docs = template = """%s{context}{question}""" % prefix
    else:
        template = """%s

    else:
        use_template = False

    if db and use_docs_planned:
        base_path = 'locks'
        makedirs(base_path)
        if hasattr(db, '_persist_directory'):

            filter_kwargs = {}
        else:
            assert document_choice is not None, "Document choice was None"
            if len(document_choice) >= 1 and document_choice[0] == DocumentChoice.ALL.value:
                filter_kwargs = {}
            elif len(document_choice) >= 2:
                if document_choice[0] == DocumentChoice.ALL.value:
                    # remove 'All'
                    document_choice = document_choice[1:]
                or_filter = [{"source": {"$eq": x}} for x in document_choice]
            else:
                # shouldn't reach
                filter_kwargs = {}
        if langchain_mode in [LangChainMode.LLM.value]:
            docs = []
            scores = []
        elif document_subset == DocumentSubset.TopKSources.name or query in [None, '', '\n']:
            db_documents, db_metadatas = get_docs_and_meta(db, top_k_docs, filter_kwargs=filter_kwargs)
            # similar to langchain's chroma's _results_to_docs_and_scores
            docs_with_score = [(Document(page_content=result[0], metadata=result[1] or {}), 0)
                               for result in zip(db_documents, db_metadatas)]

            # order documents
            doc_hashes = [x.get('doc_hash', 'None') for x in db_metadatas]
            doc_chunk_ids = [x.get('chunk_id', 0) for x in db_metadatas]
            docs_with_score = [x for _, _, x in
                               sorted(zip(doc_hashes, doc_chunk_ids, docs_with_score), key=lambda x: (x[0], x[1]))
                               ]
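Since a vector store returns chunks in arbitrary order, the code above re-sorts them by the (doc_hash, chunk_id) metadata written at ingest time. A small illustrative example of that sort key, not taken from the commit (values are made up):

# Sketch: restore per-document chunk order using metadata written at ingest time.
chunks = [
    dict(text="B-2", meta=dict(doc_hash="bbb", chunk_id=2)),
    dict(text="A-1", meta=dict(doc_hash="aaa", chunk_id=1)),
    dict(text="A-0", meta=dict(doc_hash="aaa", chunk_id=0)),
    dict(text="B-0", meta=dict(doc_hash="bbb", chunk_id=0)),
]

ordered = sorted(chunks, key=lambda c: (c['meta'].get('doc_hash', 'None'),
                                        c['meta'].get('chunk_id', 0)))
print([c['text'] for c in ordered])  # ['A-0', 'A-1', 'B-0', 'B-2']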
            docs_with_score.reverse()
        # cut off so no high distance docs/sources considered
        have_any_docs |= len(docs_with_score) > 0  # before cut
        docs = [x[0] for x in docs_with_score if x[1] < cut_distance]
        scores = [x[1] for x in docs_with_score if x[1] < cut_distance]
        if len(scores) > 0 and verbose:
            print("Distance: min: %s max: %s mean: %s median: %s" %
                  (scores[0], scores[-1], np.mean(scores), np.median(scores)), flush=True)

        docs = []
        scores = []

    if not docs and use_docs_planned and model_name not in non_hf_types:
        # if HF type and have no docs, can bail out
        return docs, None, [], False, have_any_docs
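cut_distance acts as a hard ceiling on embedding distance: anything retrieved farther than the threshold is dropped before prompting (1.64 for queries, 1.1 inside get_chain's defaults). A minimal sketch of the same filter with made-up values, not part of the commit:

# Sketch: drop retrieved chunks whose similarity-search distance exceeds cut_distance.
docs_with_score = [("relevant chunk", 0.35), ("borderline chunk", 1.05), ("noise", 2.3)]
cut_distance = 1.64

docs = [d for d, s in docs_with_score if s < cut_distance]
scores = [s for d, s in docs_with_score if s < cut_distance]
print(docs, scores)  # ['relevant chunk', 'borderline chunk'] [0.35, 1.05]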
    if len(docs) == 0:
        # avoid context == in prompt then
        use_docs_planned = False
        template = template_if_no_docs

    if langchain_action == LangChainAction.QUERY.value:

        else:
            # only if use_openai_model = True, unused normally except in testing
            chain = load_qa_with_sources_chain(llm)
        if not use_docs_planned:
            chain_kwargs = dict(input_documents=[], question=query)
        else:
            chain_kwargs = dict(input_documents=docs, question=query)

    else:
        raise RuntimeError("No such langchain_action=%s" % langchain_action)

    return docs, target, scores, use_docs_planned, have_any_docs


def get_sources_answer(query, answer, scores, show_rank, answer_with_sources, verbose=False):
def chunk_sources(sources, chunk=True, chunk_size=512, language=None):
    if not chunk:
        [x.metadata.update(dict(chunk_id=chunk_id)) for chunk_id, x in enumerate(sources)]
        return sources
    if not isinstance(sources, (list, tuple, types.GeneratorType)) and not callable(sources):
        # if just one document

        source_chunks = splitter.split_documents(sources)

    # currently in order, but when pull from db won't be, so mark order and document by hash
    [x.metadata.update(dict(chunk_id=chunk_id)) for chunk_id, x in enumerate(source_chunks)]

    return source_chunks
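chunk_sources() now stamps a running chunk_id on every piece it emits, even when chunking is disabled, which is what makes the later re-sort at retrieval time possible. A stand-alone sketch of that numbering (using a plain fixed-size splitter, not langchain's), included for illustration only:

# Sketch: number chunks at split time so their order can be recovered later.
def chunk_text(text, chunk_size=20):
    pieces = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return [dict(text=p, metadata=dict(chunk_id=i)) for i, p in enumerate(pieces)]


for c in chunk_text("A fairly long document that gets split into fixed-size pieces."):
    print(c['metadata']['chunk_id'], repr(c['text']))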
gradio_runner.py
CHANGED
@@ -50,16 +50,20 @@ def fix_pydantic_duplicate_validators_error():

fix_pydantic_duplicate_validators_error()

from enums import DocumentSubset, no_model_str, no_lora_str, no_server_str, LangChainAction, LangChainMode, \
    DocumentChoice, langchain_modes_intrinsic
from gradio_themes import H2oTheme, SoftTheme, get_h2o_title, get_simple_title, get_dark_js, spacing_xsm, radius_xsm, \
    text_xsm
from prompter import prompt_type_to_model_name, prompt_types_strings, inv_prompt_type_to_model_lower, non_hf_types, \
    get_prompt
from utils import flatten_list, zip_data, s3up, clear_torch_cache, get_torch_allocated, system_info_print, \
    ping, get_short_name, makedirs, get_kwargs, remove, system_info, ping_gpu, get_url, get_local_ip, \
    save_collection_names
from gen import get_model, languages_covered, evaluate, score_qa, inputs_kwargs_list, scratch_base_dir, \
    get_max_max_new_tokens, get_minmax_top_k_docs, history_to_context, langchain_actions, langchain_agents_list, \
    update_langchain
from evaluate_params import eval_func_param_names, no_default_param_names, eval_func_param_names_defaults, \
    input_args_list

from apscheduler.schedulers.background import BackgroundScheduler

@@ -94,13 +98,11 @@ def go_gradio(**kwargs):
    memory_restriction_level = kwargs['memory_restriction_level']
    n_gpus = kwargs['n_gpus']
    admin_pass = kwargs['admin_pass']
    model_states = kwargs['model_states']
    dbs = kwargs['dbs']
    db_type = kwargs['db_type']
    visible_langchain_actions = kwargs['visible_langchain_actions']
    visible_langchain_agents = kwargs['visible_langchain_agents']
    allow_upload_to_user_data = kwargs['allow_upload_to_user_data']
    allow_upload_to_my_data = kwargs['allow_upload_to_my_data']
    enable_sources_list = kwargs['enable_sources_list']

@@ -111,8 +113,19 @@ def go_gradio(**kwargs):
    enable_captions = kwargs['enable_captions']
    captions_model = kwargs['captions_model']
    enable_ocr = kwargs['enable_ocr']
    enable_pdf_ocr = kwargs['enable_pdf_ocr']
    caption_loader = kwargs['caption_loader']

    # for dynamic state per user session in gradio
    model_state0 = kwargs['model_state0']
    score_model_state0 = kwargs['score_model_state0']
    my_db_state0 = kwargs['my_db_state0']
    selection_docs_state0 = kwargs['selection_docs_state0']
    # for evaluate defaults
    langchain_modes0 = kwargs['langchain_modes']
    visible_langchain_modes0 = kwargs['visible_langchain_modes']
    langchain_mode_paths0 = kwargs['langchain_mode_paths']

    # easy update of kwargs needed for evaluate() etc.
    queue = True
    allow_upload = allow_upload_to_user_data or allow_upload_to_my_data

@@ -132,25 +145,11 @@ def go_gradio(**kwargs):
                        " use Enter for multiple input lines)"

    title = 'h2oGPT'
    description = """<iframe src="https://ghbtns.com/github-btn.html?user=h2oai&repo=h2ogpt&type=star&count=true&size=small" frameborder="0" scrolling="0" width="250" height="20" title="GitHub"></iframe><small><a href="https://github.com/h2oai/h2ogpt">h2oGPT</a> <a href="https://github.com/h2oai/h2o-llmstudio">H2O LLM Studio</a><br><a href="https://huggingface.co/h2oai">🤗 Models</a>"""
    description_bottom = "If this host is busy, try<br>[Multi-Model](https://gpt.h2o.ai)<br>[Falcon 40B](https://falcon.h2o.ai)<br>[Vicuna 33B](https://wizardvicuna.h2o.ai)<br>[MPT 30B-Chat](https://mpt.h2o.ai)<br>[HF Spaces1](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot)<br>[HF Spaces2](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot2)<br>"
    if is_hf:
        description_bottom += '''<a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" style="white-space: nowrap" alt="Duplicate Space"></a>'''

    css_code = get_css(kwargs)

    if kwargs['gradio_offline_level'] >= 0:
@@ -180,9 +179,9 @@ def go_gradio(**kwargs):
    demo = gr.Blocks(theme=theme, css=css_code, title="h2oGPT", analytics_enabled=False)
    callback = gr.CSVLogger()

-   if kwargs['base_model'].strip() not in
    lora_options = kwargs['extra_lora_options']
    if kwargs['lora_weights'].strip() not in lora_options:
        lora_options = [kwargs['lora_weights'].strip()] + lora_options

@@ -197,7 +196,7 @@ def go_gradio(**kwargs):
    # always add in no lora case
    # add fake space so doesn't go away in gradio dropdown
    lora_options = [no_lora_str] + lora_options
    server_options = [no_server_str] + server_options
    # always add in no model case so can free memory

@@ -251,6 +250,14 @@ def go_gradio(**kwargs):
        # else gets input_list at time of submit that is old, and shows up as truncated in chatbot
        return x

    with demo:
        # avoid actual model/tokenizer here or anything that would be bad to deepcopy
        # https://github.com/gradio-app/gradio/issues/3558

@@ -264,18 +271,32 @@ def go_gradio(**kwargs):
                         prompt_dict=kwargs['prompt_dict'],
                         )
                     )
        model_state2 = gr.State(kwargs['model_state_none'].copy())
-       model_options_state = gr.State([
        lora_options_state = gr.State([lora_options])
        server_options_state = gr.State([server_options])
-       my_db_state = gr.State(
        chat_state = gr.State({})
-       docs_state00 = kwargs['document_choice'] + [
        docs_state0 = []
        [docs_state0.append(x) for x in docs_state00 if x not in docs_state0]
        docs_state = gr.State(docs_state0)
        viewable_docs_state0 = []
        viewable_docs_state = gr.State(viewable_docs_state0)

        gr.Markdown(f"""
            {get_h2o_title(title, description) if kwargs['h2ocolors'] else get_simple_title(title, description)}
            """)
@@ -289,7 +310,7 @@ def go_gradio(**kwargs):
            'model_lock'] else "Response Scores: %s" % nas

        if kwargs['langchain_mode'] != LangChainMode.DISABLED.value:
-           extra_prompt_form = ". For summarization,
        else:
            extra_prompt_form = ""
        if kwargs['input_lines'] > 1:

@@ -297,6 +318,34 @@ def go_gradio(**kwargs):
        else:
            instruction_label = "Enter to Submit, Shift-Enter for more lines%s" % extra_prompt_form

        normal_block = gr.Row(visible=not base_wanted, equal_height=False)
        with normal_block:
            side_bar = gr.Column(elem_id="col_container", scale=1, min_width=100)

@@ -317,6 +366,7 @@ def go_gradio(**kwargs):
                                      scale=1,
                                      min_width=0,
                                      elem_id="warning", elem_classes="feedback")
                url_visible = kwargs['langchain_mode'] != 'Disabled' and allow_upload and enable_url_upload
                url_label = 'URL/ArXiv' if have_arxiv else 'URL'
                url_text = gr.Textbox(label=url_label,

@@ -330,29 +380,20 @@ def go_gradio(**kwargs):
                                      visible=text_visible)
                github_textbox = gr.Textbox(label="Github URL", visible=False)  # FIXME WIP
                database_visible = kwargs['langchain_mode'] != 'Disabled'
-               with gr.Accordion("
-               # don't show 'wiki' since only usually useful for internal testing at moment
-               no_show_modes = ['Disabled', 'wiki']
-               else:
-               no_show_modes = ['Disabled']
-               allowed_modes = visible_langchain_modes.copy()
-               allowed_modes = [x for x in allowed_modes if x in dbs]
-               allowed_modes += ['ChatLLM', 'LLM']
-               if allow_upload_to_my_data and 'MyData' not in allowed_modes:
-               allowed_modes += ['MyData']
-               if allow_upload_to_user_data and 'UserData' not in allowed_modes:
-               allowed_modes += ['UserData']
                langchain_mode = gr.Radio(
                    value=kwargs['langchain_mode'],
                    label="Collections",
                    show_label=True,
                    visible=kwargs['langchain_mode'] != 'Disabled',
                    min_width=100)
                    label="Subset",
-                   value=
                    interactive=True,
                )
                allowed_actions = [x for x in langchain_actions if x in visible_langchain_actions]
@@ -361,6 +402,14 @@ def go_gradio(**kwargs):
                    value=allowed_actions[0] if len(allowed_actions) > 0 else None,
                    label="Action",
                    visible=True)
            col_tabs = gr.Column(elem_id="col_container", scale=10)
            with (col_tabs, gr.Tabs()):
                with gr.TabItem("Chat"):

@@ -408,9 +457,9 @@ def go_gradio(**kwargs):
                        mw1 = 50
                        mw2 = 50
                        with gr.Column(min_width=mw1):
-                           submit = gr.Button(value='Submit', variant='primary',
                                               min_width=mw1)
-                           stop_btn = gr.Button(value="Stop", variant='secondary',
                                                min_width=mw1)
                            save_chat_btn = gr.Button("Save", size='sm', min_width=mw1)
                        with gr.Column(min_width=mw2):

@@ -431,20 +480,50 @@ def go_gradio(**kwargs):
                with gr.TabItem("Document Selection"):
                    document_choice = gr.Dropdown(docs_state0,
                                                  label="Select Subset of Document(s) %s" % file_types_str,
-                                                 value=
                                                  interactive=True,
                                                  multiselect=True,
                                                  visible=kwargs['langchain_mode'] != 'Disabled',
                                                  )
                    sources_visible = kwargs['langchain_mode'] != 'Disabled' and enable_sources_list
                    with gr.Row():

                    sources_row = gr.Row(visible=kwargs['langchain_mode'] != 'Disabled' and enable_sources_list,
                                         equal_height=False)

@@ -469,6 +548,7 @@ def go_gradio(**kwargs):
                                                value=None,
                                                interactive=True,
                                                multiselect=False,
                                                )
                        with gr.Column(scale=4):
                            pass
@@ -713,19 +793,20 @@ def go_gradio(**kwargs):
                        side_bar_btn = gr.Button("Toggle SideBar", variant="secondary", size="sm")
                        submit_buttons_btn = gr.Button("Toggle Submit Buttons", variant="secondary", size="sm")
                        col_tabs_scale = gr.Slider(minimum=1, maximum=20, value=10, step=1, label='Window Size')
-                       text_outputs_height = gr.Slider(minimum=100, maximum=
-                       step=
                        dark_mode_btn = gr.Button("Dark Mode", variant="secondary", size="sm")
                    with gr.Column(scale=4):
                        pass
                admin_row = gr.Row()
                with admin_row:
                    with gr.Column(scale=1):
-                       admin_pass_textbox = gr.Textbox(label="Admin Password", type='password',
                    with gr.Column(scale=4):
                        pass
-               system_row = gr.Row(visible=
                with system_row:
                    with gr.Column():
                        with gr.Row():

@@ -789,23 +870,24 @@ def go_gradio(**kwargs):
            else:
                return tuple([gr.update(interactive=True)] * len(args))

-           # Add to UserData
            update_db_func = functools.partial(update_user_db,
                                               dbs=dbs,
                                               db_type=db_type,
                                               use_openai_embedding=use_openai_embedding,
                                               hf_embedding_model=hf_embedding_model,
-                                              enable_captions=enable_captions,
                                               captions_model=captions_model,
                                               caption_loader=caption_loader,
                                               verbose=kwargs['verbose'],
-                                              user_path=kwargs['user_path'],
                                               n_jobs=kwargs['n_jobs'],
                                               )
            add_file_outputs = [fileup_output, langchain_mode]
            add_file_kwargs = dict(fn=update_db_func,
-                                  inputs=[fileup_output, my_db_state, chunk, chunk_size,
                                   outputs=add_file_outputs + [sources_text, doc_exception_text],
                                   queue=queue,
                                   api_name='add_file' if allow_api and allow_upload_to_user_data else None)

@@ -817,6 +899,15 @@ def go_gradio(**kwargs):
            eventdb1b = eventdb1.then(make_interactive, inputs=add_file_outputs, outputs=add_file_outputs,
                                      show_progress='minimal')

            # note for update_user_db_func output is ignored for db

            def clear_textbox():
@@ -826,7 +917,8 @@ def go_gradio(**kwargs):
            add_url_outputs = [url_text, langchain_mode]
            add_url_kwargs = dict(fn=update_user_db_url_func,
-                                 inputs=[url_text, my_db_state, chunk, chunk_size,
                                  outputs=add_url_outputs + [sources_text, doc_exception_text],
                                  queue=queue,
                                  api_name='add_url' if allow_api and allow_upload_to_user_data else None)

@@ -843,7 +935,8 @@ def go_gradio(**kwargs):
            update_user_db_txt_func = functools.partial(update_db_func, is_txt=True)
            add_text_outputs = [user_text_text, langchain_mode]
            add_text_kwargs = dict(fn=update_user_db_txt_func,
-                                  inputs=[user_text_text, my_db_state, chunk, chunk_size,
                                   outputs=add_text_outputs + [sources_text, doc_exception_text],
                                   queue=queue,
                                   api_name='add_text' if allow_api and allow_upload_to_user_data else None

@@ -855,7 +948,7 @@ def go_gradio(**kwargs):
            eventdb3 = eventdb3b.then(**add_text_kwargs, show_progress='full')
            eventdb3c = eventdb3.then(make_interactive, inputs=add_text_outputs, outputs=add_text_outputs,
                                      show_progress='minimal')
-           db_events = [eventdb1a, eventdb1, eventdb1b,
                         eventdb2a, eventdb2, eventdb2b, eventdb2c,
                         eventdb3a, eventdb3b, eventdb3, eventdb3c]

@@ -863,14 +956,14 @@ def go_gradio(**kwargs):
            # if change collection source, must clear doc selections from it to avoid inconsistency
            def clear_doc_choice():
-               return gr.Dropdown.update(choices=docs_state0, value=

            langchain_mode.change(clear_doc_choice, inputs=None, outputs=document_choice, queue=False)

            def resize_col_tabs(x):
                return gr.Dropdown.update(scale=x)

-           col_tabs_scale.change(fn=resize_col_tabs, inputs=col_tabs_scale, outputs=col_tabs)

            def resize_chatbots(x, num_model_lock=0):
                if num_model_lock == 0:
@@ -881,7 +974,7 @@ def go_gradio(**kwargs):
            resize_chatbots_func = functools.partial(resize_chatbots, num_model_lock=len(text_outputs))
            text_outputs_height.change(fn=resize_chatbots_func, inputs=text_outputs_height,
-                                      outputs=[text_output, text_output2] + text_outputs)

            def update_dropdown(x):
                return gr.Dropdown.update(choices=x, value=[docs_state0[0]])

@@ -972,7 +1065,8 @@ def go_gradio(**kwargs):
                if file.startswith('http') or file.startswith('https'):
                    # if file is online, then might as well use google(?)
                    document1 = file
-                   return gr.update(visible=True,
                    </iframe>
                    """), dummy1, dummy1, dummy1
                else:

@@ -995,9 +1089,11 @@ def go_gradio(**kwargs):
            refresh_sources1 = functools.partial(update_and_get_source_files_given_langchain_mode,
                                                 **get_kwargs(update_and_get_source_files_given_langchain_mode,
-                                                             exclude_names=['
                                                              **all_kwargs))
-           eventdb9 = refresh_sources_btn.click(fn=refresh_sources1,
                                                 outputs=sources_text,
                                                 api_name='refresh_sources' if allow_api else None)
@@ -1007,9 +1103,153 @@ def go_gradio(**kwargs):
            def close_admin(x):
                return gr.update(visible=not (x == admin_pass))

                .then(close_admin, inputs=admin_pass_textbox, outputs=admin_row, queue=False)

        inputs_list, inputs_dict = get_inputs_list(all_kwargs, kwargs['model_lower'], model_id=1)
        inputs_list2, inputs_dict2 = get_inputs_list(all_kwargs, kwargs['model_lower'], model_id=2)
        from functools import partial
@@ -1021,11 +1261,11 @@ def go_gradio(**kwargs):
        def evaluate_nochat(*args1, default_kwargs1=None, str_api=False, **kwargs1):
            args_list = list(args1)
            if str_api:
-               user_kwargs = args_list[
                assert isinstance(user_kwargs, str)
                user_kwargs = ast.literal_eval(user_kwargs)
            else:
-               user_kwargs = {k: v for k, v in zip(eval_func_param_names, args_list[
            # only used for submit_nochat_api
            user_kwargs['chat'] = False
            if 'stream_output' not in user_kwargs:

@@ -1035,6 +1275,8 @@ def go_gradio(**kwargs):
                user_kwargs['langchain_mode'] = 'Disabled'
            if 'langchain_action' not in user_kwargs:
                user_kwargs['langchain_action'] = LangChainAction.QUERY.value

            set1 = set(list(default_kwargs1.keys()))
            set2 = set(eval_func_param_names)

@@ -1042,10 +1284,11 @@ def go_gradio(**kwargs):
            # correct ordering. Note some things may not be in default_kwargs, so can't be default of user_kwargs.get()
            model_state1 = args_list[0]
            my_db_state1 = args_list[1]
            args_list = [user_kwargs[k] if k in user_kwargs and user_kwargs[k] is not None else default_kwargs1[k] for k
                         in eval_func_param_names]
            assert len(args_list) == len(eval_func_param_names)
-           args_list = [model_state1, my_db_state1] + args_list

            try:
                for res_dict in evaluate(*tuple(args_list), **kwargs1):
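evaluate_nochat() above rebuilds the positional argument list for evaluate() by walking eval_func_param_names in order and taking the user's value when present, else the server default. A toy version of that merge, not from the commit (parameter names here are illustrative):

# Sketch: merge user-supplied kwargs over defaults in a fixed parameter order.
eval_param_names = ['instruction_nochat', 'max_new_tokens', 'langchain_mode']
defaults = dict(instruction_nochat='', max_new_tokens=256, langchain_mode='Disabled')
user_kwargs = dict(instruction_nochat='Who are you?', max_new_tokens=None)

args_list = [user_kwargs[k] if k in user_kwargs and user_kwargs[k] is not None else defaults[k]
             for k in eval_param_names]
print(args_list)  # ['Who are you?', 256, 'Disabled']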
@@ -1216,6 +1459,7 @@ def go_gradio(**kwargs):
            prompt_type1 = args_list[eval_func_param_names.index('prompt_type')]
            langchain_mode1 = args_list[eval_func_param_names.index('langchain_mode')]
            langchain_action1 = args_list[eval_func_param_names.index('langchain_action')]
            document_subset1 = args_list[eval_func_param_names.index('document_subset')]
            document_choice1 = args_list[eval_func_param_names.index('document_choice')]
            if not prompt_type1:

@@ -1248,10 +1492,7 @@ def go_gradio(**kwargs):
                history[-1][1] = None
                return history
            if user_message1 in ['', None, '\n']:
-               if DocumentChoices.All.name != document_subset1 \
-                       or \
-                       langchain_mode1 in [LangChainMode.CHAT_LLM.value, LangChainMode.LLM.value]:
                # reject non-retry submit/enter
                return history
            user_message1 = fix_text_for_gradio(user_message1)

@@ -1298,10 +1539,12 @@ def go_gradio(**kwargs):
            API only called for which_model=0, default for inputs_list, but rest should ignore inputs_list
            :return: last element is True if should run bot, False if should just yield history
            """
            # don't deepcopy, can contain model itself
            args_list = list(args).copy()
-           model_state1 = args_list[-
-           my_db_state1 = args_list[-
            history = args_list[-1]
            prompt_type1 = args_list[eval_func_param_names.index('prompt_type')]
            prompt_dict1 = args_list[eval_func_param_names.index('prompt_dict')]

@@ -1309,9 +1552,11 @@ def go_gradio(**kwargs):
            if model_state1['model'] is None or model_state1['model'] == no_model_str:
                return history, None, None, None

-           args_list = args_list[:-
            langchain_mode1 = args_list[eval_func_param_names.index('langchain_mode')]
            langchain_action1 = args_list[eval_func_param_names.index('langchain_action')]
            document_subset1 = args_list[eval_func_param_names.index('document_subset')]
            document_choice1 = args_list[eval_func_param_names.index('document_choice')]
            if not history:

@@ -1324,10 +1569,7 @@ def go_gradio(**kwargs):
                instruction1 = history[-1][0]
                history[-1][1] = None
            elif not instruction1:
-               if DocumentChoices.All.name != document_choice1 \
-                       or \
-                       langchain_mode1 in [LangChainMode.CHAT_LLM.value, LangChainMode.LLM.value]:
                # if not retrying, then reject empty query
                return history, None, None, None
            elif len(history) > 0 and history[-1][1] not in [None, '']:
@@ -1344,7 +1586,9 @@ def go_gradio(**kwargs):
            chat1 = args_list[eval_func_param_names.index('chat')]
            model_max_length1 = get_model_max_length(model_state1)
-           context1 = history_to_context(history, langchain_mode1,
                                          model_max_length1, memory_restriction_level,
                                          kwargs['keep_sources_in_context'])
            args_list[0] = instruction1  # override original instruction with history from user
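history_to_context() turns prior chat turns into a context string while respecting the model's maximum length. A rough sketch of that idea, not the project's implementation, counting characters instead of tokens for simplicity:

# Sketch: build a context string from newest-to-oldest turns until a budget is hit.
def history_to_context_sketch(history, max_chars=200):
    parts = []
    used = 0
    for user_msg, bot_msg in reversed(history):
        turn = "<human>: %s\n<bot>: %s\n" % (user_msg, bot_msg or '')
        if used + len(turn) > max_chars:
            break
        parts.append(turn)
        used += len(turn)
    return ''.join(reversed(parts))


history = [["Hi", "Hello!"], ["Summarize the doc", "It covers ingestion and querying."]]
print(history_to_context_sketch(history))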
@@ -1353,6 +1597,7 @@ def go_gradio(**kwargs):
            fun1 = partial(evaluate,
                           model_state1,
                           my_db_state1,
                           *tuple(args_list),
                           **kwargs_evaluate)

@@ -1398,24 +1643,26 @@ def go_gradio(**kwargs):
                clear_torch_cache()
                return

-           def clear_embeddings(langchain_mode1,
                # clear any use of embedding that sits on GPU, else keeps accumulating GPU usage even if clear torch cache
-               if db_type == 'chroma' and langchain_mode1 not in ['
                    from gpt_langchain import clear_embedding
                    db = dbs.get('langchain_mode1')
                    if db is not None and not isinstance(db, str):
                        clear_embedding(db)
-               if

            def bot(*args, retry=False):
-               history, fun1, langchain_mode1,
                try:
                    for res in get_response(fun1, history):
                        yield res
                finally:
                    clear_torch_cache()
-                   clear_embeddings(langchain_mode1,

            def all_bot(*args, retry=False, model_states1=None):
                args_list = list(args).copy()

@@ -1425,12 +1672,14 @@ def go_gradio(**kwargs):
                stream_output1 = args_list[eval_func_param_names.index('stream_output')]
                max_time1 = args_list[eval_func_param_names.index('max_time')]
                langchain_mode1 = args_list[eval_func_param_names.index('langchain_mode')]
                try:
                    gen_list = []
                    for chatboti, (chatbot1, model_state1) in enumerate(zip(chatbots, model_states1)):
                        args_list1 = args_list0.copy()
-                       args_list1.insert(-
                        # if at start, have None in response still, replace with '' so client etc. acts like normal
                        # assumes other parts of code treat '' and None as if no response yet from bot
                        # can't do this later in bot code as racy with threaded generators
@@ -1440,8 +1689,8 @@ def go_gradio(**kwargs):
                        # so consistent with prep_bot()
                        # with model_state1 at -3, my_db_state1 at -2, and history(chatbot) at -1
                        # langchain_mode1 and my_db_state1 should be same for every bot
-                       history, fun1, langchain_mode1,
                        gen1 = get_response(fun1, history)
                        if stream_output1:
                            gen1 = TimeoutIterator(gen1, timeout=0.01, sentinel=None, raise_on_exception=False)

@@ -1487,7 +1736,7 @@ def go_gradio(**kwargs):
                        print("Generate exceptions: %s" % exceptions, flush=True)
                finally:
                    clear_torch_cache()
-                   clear_embeddings(langchain_mode1,

            # NORMAL MODEL
            user_args = dict(fn=functools.partial(user, sanitize_user_prompt=kwargs['sanitize_user_prompt']),

@@ -1495,11 +1744,11 @@ def go_gradio(**kwargs):
                             outputs=text_output,
                             )
            bot_args = dict(fn=bot,
-                           inputs=inputs_list + [model_state, my_db_state] + [text_output],
                            outputs=[text_output, chat_exception_text],
                            )
            retry_bot_args = dict(fn=functools.partial(bot, retry=True),
-                                 inputs=inputs_list + [model_state, my_db_state] + [text_output],
                                  outputs=[text_output, chat_exception_text],
                                  )
            retry_user_args = dict(fn=functools.partial(user, retry=True),

@@ -1517,11 +1766,11 @@ def go_gradio(**kwargs):
                              outputs=text_output2,
                              )
            bot_args2 = dict(fn=bot,
-                            inputs=inputs_list2 + [model_state2, my_db_state] + [text_output2],
                             outputs=[text_output2, chat_exception_text],
                             )
            retry_bot_args2 = dict(fn=functools.partial(bot, retry=True),
-                                  inputs=inputs_list2 + [model_state2, my_db_state] + [text_output2],
                                   outputs=[text_output2, chat_exception_text],
                                   )
            retry_user_args2 = dict(fn=functools.partial(user, retry=True),

@@ -1542,11 +1791,11 @@ def go_gradio(**kwargs):
                                 outputs=text_outputs,
                                 )
            all_bot_args = dict(fn=functools.partial(all_bot, model_states1=model_states),
-                               inputs=inputs_list + [my_db_state] + text_outputs,
                                outputs=text_outputs + [chat_exception_text],
                                )
            all_retry_bot_args = dict(fn=functools.partial(all_bot, model_states1=model_states, retry=True),
-                                     inputs=inputs_list + [my_db_state] + text_outputs,
                                      outputs=text_outputs + [chat_exception_text],
                                      )
            all_retry_user_args = dict(fn=functools.partial(all_user, retry=True,
@@ -1708,6 +1957,11 @@ def go_gradio(**kwargs):
        def get_short_chat(x, short_chats, short_len=20, words=4):
            if x and len(x[0]) == 2 and x[0][0] is not None:
                short_chat = ' '.join(x[0][0][:short_len].split(' ')[:words]).strip()
                short_chat = dedup(short_chat, short_chats)
            else:
                short_chat = None

@@ -1775,14 +2029,12 @@ def go_gradio(**kwargs):
            already_exists = any([is_chat_same(chat_list, x) for x in old_chat_lists])
            if not already_exists:
                chat_state1[short_chat] = chat_list.copy()
-           ret_list = [chat_list] + [chat_state1]
-           return tuple(ret_list)

        def switch_chat(chat_key, chat_state1, num_model_lock=0):
            chosen_chat = chat_state1[chat_key]

@@ -1813,7 +2065,7 @@ def go_gradio(**kwargs):
        remove_chat_event = remove_chat_btn.click(remove_chat,
                                                  inputs=[radio_chats, chat_state], outputs=[radio_chats, chat_state],
-                                                 queue=False)

        def get_chats1(chat_state1):
            base = 'chats'

@@ -1844,7 +2096,7 @@ def go_gradio(**kwargs):
                new_chats = json.loads(f.read())
                for chat1_k, chat1_v in new_chats.items():
                    # ignore chat1_k, regenerate and de-dup to avoid loss
            except BaseException as e:
                t, v, tb = sys.exc_info()
                ex = ''.join(traceback.format_exception(t, v, tb))
@@ -1870,24 +2122,17 @@ def go_gradio(**kwargs):
            .then(deselect_radio_chats, inputs=None, outputs=radio_chats, queue=False) \
            .then(clear_scores, outputs=[score_text, score_text2, score_text_nochat])

-       def update_radio_chats(chat_state1):
-           # reverse so newest at top
-           choices = list(chat_state1.keys()).copy()
-           choices.reverse()
-           return gr.update(choices=choices, value=None)

        clear_event = save_chat_btn.click(save_chat,
                                          inputs=[text_output, text_output2] + text_outputs + [chat_state],
-                                         outputs=[
-                                         api_name='save_chat' if allow_api else None)
-           .then(clear_scores, outputs=[score_text, score_text2, score_text_nochat])

        # NOTE: clear of instruction/iinput for nochat has to come after score,
        # because score for nochat consumes actual textbox, while chat consumes chat history filled by user()
        no_chat_args = dict(fn=fun,
-                           inputs=[model_state, my_db_state] + inputs_list,
                            outputs=text_output_nochat,
                            queue=queue,
                            )

@@ -1906,7 +2151,8 @@ def go_gradio(**kwargs):
            .then(clear_torch_cache)

        submit_event_nochat_api = submit_nochat_api.click(fun_with_dict_str,
-                                                         inputs=[model_state, my_db_state,
                                                          outputs=text_output_nochat_api,
                                                          queue=True,  # required for generator
                                                          api_name='submit_nochat_api' if allow_api else None) \

@@ -2156,6 +2402,8 @@ def go_gradio(**kwargs):
                print("Exception: %s" % str(e), flush=True)
            return json.dumps(sys_dict)

        get_system_info_dict_func = functools.partial(get_system_info_dict, **all_kwargs)

        system_dict_event = system_btn2.click(get_system_info_dict_func,
@@ -2185,12 +2433,15 @@ def go_gradio(**kwargs):
        else:
            tokenizer = None
        if tokenizer is not None:
-           langchain_mode1 = '
            # fake user message to mimic bot()
            chat1 = copy.deepcopy(chat1)
            chat1 = chat1 + [['user_message1', None]]
            model_max_length1 = tokenizer.model_max_length
-           context1 = history_to_context(chat1, langchain_mode1,
                                          model_max_length1,
                                          memory_restriction_level1, keep_sources_in_context1)
            return str(tokenizer(context1, return_tensors="pt")['input_ids'].shape[1])

@@ -2220,7 +2471,7 @@ def go_gradio(**kwargs):
                     ,
                     queue=False, api_name='stop' if allow_api else None).then(clear_torch_cache, queue=False)

-   demo.load(None, None, None, _js=get_dark_js() if kwargs['

    demo.queue(concurrency_count=kwargs['concurrency_count'], api_open=kwargs['api_open'])
    favicon_path = "h2o-logo.svg"

@@ -2235,7 +2486,8 @@ def go_gradio(**kwargs):
        # FIXME: disable for gptj, langchain or gpt4all modify print itself
        # FIXME: and any multi-threaded/async print will enter model output!
        scheduler.add_job(func=ping, trigger="interval", seconds=60)
    scheduler.start()

    # import control

@@ -2254,9 +2506,6 @@ def go_gradio(**kwargs):
    demo.block_thread()


-   input_args_list = ['model_state', 'my_db_state']
def get_inputs_list(inputs_dict, model_lower, model_id=1):
    """
    map gradio objects in locals() to inputs for evaluate().
@@ -2290,8 +2539,9 @@ def get_inputs_list(inputs_dict, model_lower, model_id=1):
    return inputs_list, inputs_dict_out


-def get_sources(

    if langchain_mode in ['ChatLLM', 'LLM']:
        source_files_added = "NA"

@@ -2300,7 +2550,8 @@ def get_sources(db1, langchain_mode, dbs=None, docs_state0=None):
        source_files_added = "Not showing wiki_full, takes about 20 seconds and makes 4MB file." \
                             " Ask jon.mckinney@h2o.ai for file if required."
        source_list = []
-   elif langchain_mode
        from gpt_langchain import get_metadatas
        metadatas = get_metadatas(db1[0])
        source_list = sorted(set([x['source'] for x in metadatas]))

@@ -2331,14 +2582,13 @@ def set_userid(db1):
        db1[1] = str(uuid.uuid4())


-def update_user_db(file,
    if file is None:
        raise RuntimeError("Don't use change, use input")

    try:
-       return _update_user_db(file,
                               langchain_mode=langchain_mode, dbs=dbs,
                               **kwargs)
    except BaseException as e:
@@ -2369,25 +2619,30 @@ def get_lock_file(db1, langchain_mode):
    user_id = db1[1]
    base_path = 'locks'
    makedirs(base_path)
-   lock_file = "db_%s_%s.lock" % (langchain_mode.replace(' ', '_'), user_id)
    return lock_file


def _update_user_db(file,
                    chunk=None, chunk_size=None,
-                   dbs=None, db_type=None,
                    use_openai_embedding=None,
                    hf_embedding_model=None,
                    caption_loader=None,
                    enable_captions=None,
                    captions_model=None,
                    enable_ocr=None,
                    verbose=None,
                    is_url=None, is_txt=None,
-   assert
    assert chunk is not None
    assert chunk_size is not None
    assert use_openai_embedding is not None

@@ -2396,10 +2651,9 @@ def _update_user_db(file,
    assert enable_captions is not None
    assert captions_model is not None
    assert enable_ocr is not None
    assert verbose is not None

-   set_userid(db1)
    if dbs is None:
        dbs = {}
    assert isinstance(dbs, dict), "Wrong type for dbs: %s" % str(type(dbs))

@@ -2417,17 +2671,22 @@ def _update_user_db(file,
    if langchain_mode == LangChainMode.DISABLED.value:
        return None, langchain_mode, get_source_files(), ""

-   if langchain_mode in [LangChainMode.
        # then switch to MyData, so langchain_mode also becomes way to select where upload goes
        # but default to mydata if nothing chosen, since safest
    # move temp files from gradio upload to stable location
    for fili, fil in enumerate(file):
-       if isinstance(fil, str):
        if os.path.isfile(new_fil):
            remove(new_fil)
        try:
@@ -2447,15 +2706,22 @@ def _update_user_db(file,
                           enable_captions=enable_captions,
                           captions_model=captions_model,
                           enable_ocr=enable_ocr,
                           caption_loader=caption_loader,
                           )
    exceptions = [x for x in sources if x.metadata.get('exception')]
    exceptions_strs = [x.metadata['exception'] for x in exceptions]
    sources = [x for x in sources if 'exception' not in x.metadata]

    with filelock.FileLock(lock_file):
-       if langchain_mode
        if db1[0] is not None:
            # then add
            db, num_new_sources, new_sources_metadata = add_to_db(db1[0], sources, db_type=db_type,
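_update_user_db() serializes writes to a user's collection with a file lock whose name encodes the collection and a per-session user id. A minimal sketch of that locking scheme, not taken from the commit (uses the filelock package; paths and ids are illustrative):

# Sketch: one lock file per (collection, user) so concurrent uploads don't race.
import os
import uuid
import filelock


def get_lock_file_sketch(langchain_mode, user_id, base_path='locks'):
    os.makedirs(base_path, exist_ok=True)
    return os.path.join(base_path, "db_%s_%s.lock" % (langchain_mode.replace(' ', '_'), user_id))


user_id = str(uuid.uuid4())
with filelock.FileLock(get_lock_file_sketch('MyData', user_id)):
    pass  # add sources to this user's db here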
@@ -2465,7 +2731,8 @@ def _update_user_db(file,
            # in testing expect:
            # assert len(db1) == 2 and db1[1] is None, "Bad MyData db: %s" % db1
            # for production hit, when user gets clicky:
-           assert len(db1) == 2, "Bad
            # then create
            # if added has to original state and didn't change, then would be shared db for all users
            persist_directory = os.path.join(scratch_base_dir, 'db_dir_%s_%s' % (langchain_mode, db1[1]))

@@ -2487,7 +2754,7 @@ def _update_user_db(file,
                                 use_openai_embedding=use_openai_embedding,
                                 hf_embedding_model=hf_embedding_model)
    else:
-       # then create
        db = get_db(sources, use_openai_embedding=use_openai_embedding,
                    db_type=db_type,
                    persist_directory=persist_directory,

@@ -2501,14 +2768,15 @@ def _update_user_db(file,
    return None, langchain_mode, source_files_added, '\n'.join(exceptions_strs)


-def get_db(

    with filelock.FileLock(lock_file):
        if langchain_mode in ['wiki_full']:
            # NOTE: avoid showing full wiki. Takes about 30 seconds over about 90k entries, but not useful for now
            db = None
-       elif langchain_mode
            db = db1[0]
        elif dbs is not None and langchain_mode in dbs and dbs[langchain_mode] is not None:
            db = dbs[langchain_mode]

@@ -2517,8 +2785,8 @@ def get_db(db1, langchain_mode, dbs=None):
    return db


-def get_source_files_given_langchain_mode(
-   db = get_db(
    if langchain_mode in ['ChatLLM', 'LLM'] or db is None:
        return "Sources: N/A"
    return get_source_files(db=db, exceptions=None)

@@ -2617,11 +2885,19 @@ def get_source_files(db=None, exceptions=None, metadatas=None):
    return source_files_added


-def update_and_get_source_files_given_langchain_mode(
                                                      n_jobs=None, verbose=None):

    from gpt_langchain import make_db
    db, num_new_sources, new_sources_metadata = make_db(use_openai_embedding=False,
@@ -2630,11 +2906,27 @@ def update_and_get_source_files_given_langchain_mode(db1, langchain_mode, dbs=No
                                                        chunk=chunk,
                                                        chunk_size=chunk_size,
                                                        langchain_mode=langchain_mode,
                                                        db_type=db_type,
                                                        load_db_if_exists=load_db_if_exists,
                                                        db=db,
                                                        n_jobs=n_jobs,
                                                        verbose=verbose)
    # return only new sources with text saying such
    return get_source_files(db=None, exceptions=None, metadatas=new_sources_metadata)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
fix_pydantic_duplicate_validators_error()
|
52 |
|
53 |
+
from enums import DocumentSubset, no_model_str, no_lora_str, no_server_str, LangChainAction, LangChainMode, \
|
54 |
+
DocumentChoice, langchain_modes_intrinsic
|
55 |
from gradio_themes import H2oTheme, SoftTheme, get_h2o_title, get_simple_title, get_dark_js, spacing_xsm, radius_xsm, \
|
56 |
text_xsm
|
57 |
from prompter import prompt_type_to_model_name, prompt_types_strings, inv_prompt_type_to_model_lower, non_hf_types, \
|
58 |
get_prompt
|
59 |
+
from utils import flatten_list, zip_data, s3up, clear_torch_cache, get_torch_allocated, system_info_print, \
|
60 |
+
ping, get_short_name, makedirs, get_kwargs, remove, system_info, ping_gpu, get_url, get_local_ip, \
|
61 |
+
save_collection_names
|
62 |
+
from gen import get_model, languages_covered, evaluate, score_qa, inputs_kwargs_list, scratch_base_dir, \
|
63 |
+
get_max_max_new_tokens, get_minmax_top_k_docs, history_to_context, langchain_actions, langchain_agents_list, \
|
64 |
+
update_langchain
|
65 |
+
from evaluate_params import eval_func_param_names, no_default_param_names, eval_func_param_names_defaults, \
|
66 |
+
input_args_list
|
67 |
|
68 |
from apscheduler.schedulers.background import BackgroundScheduler
|
69 |
|
|
|
98 |
memory_restriction_level = kwargs['memory_restriction_level']
|
99 |
n_gpus = kwargs['n_gpus']
|
100 |
admin_pass = kwargs['admin_pass']
|
|
|
101 |
model_states = kwargs['model_states']
|
|
|
102 |
dbs = kwargs['dbs']
|
103 |
db_type = kwargs['db_type']
|
|
|
104 |
visible_langchain_actions = kwargs['visible_langchain_actions']
|
105 |
+
visible_langchain_agents = kwargs['visible_langchain_agents']
|
106 |
allow_upload_to_user_data = kwargs['allow_upload_to_user_data']
|
107 |
allow_upload_to_my_data = kwargs['allow_upload_to_my_data']
|
108 |
enable_sources_list = kwargs['enable_sources_list']
|
|
|
113 |
enable_captions = kwargs['enable_captions']
|
114 |
captions_model = kwargs['captions_model']
|
115 |
enable_ocr = kwargs['enable_ocr']
|
116 |
+
enable_pdf_ocr = kwargs['enable_pdf_ocr']
|
117 |
caption_loader = kwargs['caption_loader']
|
118 |
|
119 |
+
# for dynamic state per user session in gradio
|
120 |
+
model_state0 = kwargs['model_state0']
|
121 |
+
score_model_state0 = kwargs['score_model_state0']
|
122 |
+
my_db_state0 = kwargs['my_db_state0']
|
123 |
+
selection_docs_state0 = kwargs['selection_docs_state0']
|
124 |
+
# for evaluate defaults
|
125 |
+
langchain_modes0 = kwargs['langchain_modes']
|
126 |
+
visible_langchain_modes0 = kwargs['visible_langchain_modes']
|
127 |
+
langchain_mode_paths0 = kwargs['langchain_mode_paths']
|
128 |
+
|
129 |
# easy update of kwargs needed for evaluate() etc.
|
130 |
queue = True
|
131 |
allow_upload = allow_upload_to_user_data or allow_upload_to_my_data
|
|
|
145 |
" use Enter for multiple input lines)"
|
146 |
|
147 |
title = 'h2oGPT'
|
148 |
+
description = """<iframe src="https://ghbtns.com/github-btn.html?user=h2oai&repo=h2ogpt&type=star&count=true&size=small" frameborder="0" scrolling="0" width="250" height="20" title="GitHub"></iframe><small><a href="https://github.com/h2oai/h2ogpt">h2oGPT</a> <a href="https://github.com/h2oai/h2o-llmstudio">H2O LLM Studio</a><br><a href="https://huggingface.co/h2oai">🤗 Models</a>"""
|
149 |
+
description_bottom = "If this host is busy, try<br>[Multi-Model](https://gpt.h2o.ai)<br>[Falcon 40B](https://falcon.h2o.ai)<br>[Vicuna 33B](https://wizardvicuna.h2o.ai)<br>[MPT 30B-Chat](https://mpt.h2o.ai)<br>[HF Spaces1](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot)<br>[HF Spaces2](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot2)<br>"
|
|
150 |
if is_hf:
|
151 |
description_bottom += '''<a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" style="white-space: nowrap" alt="Duplicate Space"></a>'''
|
152 |
+
task_info_md = ''
|
|
153 |
css_code = get_css(kwargs)
|
154 |
|
155 |
if kwargs['gradio_offline_level'] >= 0:
|
|
|
179 |
demo = gr.Blocks(theme=theme, css=css_code, title="h2oGPT", analytics_enabled=False)
|
180 |
callback = gr.CSVLogger()
|
181 |
|
182 |
+
model_options0 = flatten_list(list(prompt_type_to_model_name.values())) + kwargs['extra_model_options']
|
183 |
+
if kwargs['base_model'].strip() not in model_options0:
|
184 |
+
model_options0 = [kwargs['base_model'].strip()] + model_options0
|
185 |
lora_options = kwargs['extra_lora_options']
|
186 |
if kwargs['lora_weights'].strip() not in lora_options:
|
187 |
lora_options = [kwargs['lora_weights'].strip()] + lora_options
|
|
|
196 |
|
197 |
# always add in no lora case
|
198 |
# add fake space so doesn't go away in gradio dropdown
|
199 |
+
model_options0 = [no_model_str] + model_options0
|
200 |
lora_options = [no_lora_str] + lora_options
|
201 |
server_options = [no_server_str] + server_options
|
202 |
# always add in no model case so can free memory
|
|
|
250 |
# else gets input_list at time of submit that is old, and shows up as truncated in chatbot
|
251 |
return x
|
252 |
|
253 |
+
def allow_empty_instruction(langchain_mode1, document_subset1, langchain_action1):
|
254 |
+
allow = False
|
255 |
+
allow |= langchain_action1 not in LangChainAction.QUERY.value
|
256 |
+
allow |= document_subset1 in DocumentSubset.TopKSources.name
|
257 |
+
if langchain_mode1 in [LangChainMode.LLM.value]:
|
258 |
+
allow = False
|
259 |
+
return allow
|
260 |
+
|
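The helper above decides when an empty prompt is still actionable, for example a summarize action over selected documents or a TopKSources listing, while a plain LLM chat always needs a prompt. A tiny standalone restatement of that rule, using plain strings instead of the project's enums (the literals below are illustrative, not the exact enum values):

# Hedged restatement of allow_empty_instruction() above, with plain strings in place of
# the LangChainMode / DocumentSubset / LangChainAction enum values (assumption for brevity).
def allow_empty(langchain_mode, document_subset, langchain_action):
    allow = langchain_action != 'Query' or document_subset == 'TopKSources'
    if langchain_mode == 'LLM':
        allow = False
    return allow

print(allow_empty('UserData', 'Relevant', 'Summarize'))   # True: summarization needs no query
print(allow_empty('UserData', 'TopKSources', 'Query'))    # True: just listing top sources
print(allow_empty('LLM', 'Relevant', 'Summarize'))        # False: plain LLM chat needs a prompt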
261 |
with demo:
|
262 |
# avoid actual model/tokenizer here or anything that would be bad to deepcopy
|
263 |
# https://github.com/gradio-app/gradio/issues/3558
|
|
|
271 |
prompt_dict=kwargs['prompt_dict'],
|
272 |
)
|
273 |
)
|
274 |
+
|
275 |
+
def update_langchain_mode_paths(db1s, selection_docs_state1):
|
276 |
+
if allow_upload_to_my_data:
|
277 |
+
selection_docs_state1['langchain_mode_paths'].update({k: None for k in db1s})
|
278 |
+
dup = selection_docs_state1['langchain_mode_paths'].copy()
|
279 |
+
for k, v in dup.items():
|
280 |
+
if k not in selection_docs_state1['visible_langchain_modes']:
|
281 |
+
selection_docs_state1['langchain_mode_paths'].pop(k)
|
282 |
+
return selection_docs_state1
|
283 |
+
|
284 |
+
# Setup some gradio states for per-user dynamic state
|
285 |
model_state2 = gr.State(kwargs['model_state_none'].copy())
|
286 |
+
model_options_state = gr.State([model_options0])
|
287 |
lora_options_state = gr.State([lora_options])
|
288 |
server_options_state = gr.State([server_options])
|
289 |
+
my_db_state = gr.State(my_db_state0)
|
290 |
chat_state = gr.State({})
|
291 |
+
docs_state00 = kwargs['document_choice'] + [DocumentChoice.ALL.value]
|
292 |
docs_state0 = []
|
293 |
[docs_state0.append(x) for x in docs_state00 if x not in docs_state0]
|
294 |
docs_state = gr.State(docs_state0)
|
295 |
viewable_docs_state0 = []
|
296 |
viewable_docs_state = gr.State(viewable_docs_state0)
|
297 |
+
selection_docs_state0 = update_langchain_mode_paths(my_db_state0, selection_docs_state0)
|
298 |
+
selection_docs_state = gr.State(selection_docs_state0)
|
299 |
+
|
300 |
gr.Markdown(f"""
|
301 |
{get_h2o_title(title, description) if kwargs['h2ocolors'] else get_simple_title(title, description)}
|
302 |
""")
|
|
|
310 |
'model_lock'] else "Response Scores: %s" % nas
|
311 |
|
312 |
if kwargs['langchain_mode'] != LangChainMode.DISABLED.value:
|
313 |
+
extra_prompt_form = ". For summarization, no query required, just click submit"
|
314 |
else:
|
315 |
extra_prompt_form = ""
|
316 |
if kwargs['input_lines'] > 1:
|
|
|
318 |
else:
|
319 |
instruction_label = "Enter to Submit, Shift-Enter for more lines%s" % extra_prompt_form
|
320 |
|
321 |
+
def get_langchain_choices(selection_docs_state1):
|
322 |
+
langchain_modes = selection_docs_state1['langchain_modes']
|
323 |
+
visible_langchain_modes = selection_docs_state1['visible_langchain_modes']
|
324 |
+
|
325 |
+
if is_hf:
|
326 |
+
# don't show 'wiki' since only usually useful for internal testing at moment
|
327 |
+
no_show_modes = ['Disabled', 'wiki']
|
328 |
+
else:
|
329 |
+
no_show_modes = ['Disabled']
|
330 |
+
allowed_modes = visible_langchain_modes.copy()
|
331 |
+
# allowed_modes = [x for x in allowed_modes if x in dbs]
|
332 |
+
allowed_modes += ['LLM']
|
333 |
+
if allow_upload_to_my_data and 'MyData' not in allowed_modes:
|
334 |
+
allowed_modes += ['MyData']
|
335 |
+
if allow_upload_to_user_data and 'UserData' not in allowed_modes:
|
336 |
+
allowed_modes += ['UserData']
|
337 |
+
choices = [x for x in langchain_modes if x in allowed_modes and x not in no_show_modes]
|
338 |
+
return choices
|
339 |
+
|
340 |
+
def get_df_langchain_mode_paths(selection_docs_state1):
|
341 |
+
langchain_mode_paths = selection_docs_state1['langchain_mode_paths']
|
342 |
+
if langchain_mode_paths:
|
343 |
+
df = pd.DataFrame.from_dict(langchain_mode_paths.items(), orient='columns')
|
344 |
+
df.columns = ['Collection', 'Path']
|
345 |
+
else:
|
346 |
+
df = pd.DataFrame(None)
|
347 |
+
return df
|
348 |
+
|
349 |
normal_block = gr.Row(visible=not base_wanted, equal_height=False)
|
350 |
with normal_block:
|
351 |
side_bar = gr.Column(elem_id="col_container", scale=1, min_width=100)
|
|
|
366 |
scale=1,
|
367 |
min_width=0,
|
368 |
elem_id="warning", elem_classes="feedback")
|
369 |
+
fileup_output_text = gr.Textbox(visible=False)
|
370 |
url_visible = kwargs['langchain_mode'] != 'Disabled' and allow_upload and enable_url_upload
|
371 |
url_label = 'URL/ArXiv' if have_arxiv else 'URL'
|
372 |
url_text = gr.Textbox(label=url_label,
|
|
|
380 |
visible=text_visible)
|
381 |
github_textbox = gr.Textbox(label="Github URL", visible=False) # FIXME WIP
|
382 |
database_visible = kwargs['langchain_mode'] != 'Disabled'
|
383 |
+
with gr.Accordion("Resources", open=False, visible=database_visible):
|
384 |
+
langchain_choices0 = get_langchain_choices(selection_docs_state0)
|
|
385 |
langchain_mode = gr.Radio(
|
386 |
+
langchain_choices0,
|
387 |
value=kwargs['langchain_mode'],
|
388 |
label="Collections",
|
389 |
show_label=True,
|
390 |
visible=kwargs['langchain_mode'] != 'Disabled',
|
391 |
min_width=100)
|
392 |
+
add_chat_history_to_context = gr.Checkbox(label="Chat History",
|
393 |
+
value=kwargs['add_chat_history_to_context'])
|
394 |
+
document_subset = gr.Radio([x.name for x in DocumentSubset],
|
395 |
label="Subset",
|
396 |
+
value=DocumentSubset.Relevant.name,
|
397 |
interactive=True,
|
398 |
)
|
399 |
allowed_actions = [x for x in langchain_actions if x in visible_langchain_actions]
|
|
|
402 |
value=allowed_actions[0] if len(allowed_actions) > 0 else None,
|
403 |
label="Action",
|
404 |
visible=True)
|
405 |
+
allowed_agents = [x for x in langchain_agents_list if x in visible_langchain_agents]
|
406 |
+
langchain_agents = gr.Dropdown(
|
407 |
+
langchain_agents_list,
|
408 |
+
value=kwargs['langchain_agents'],
|
409 |
+
label="Agents",
|
410 |
+
multiselect=True,
|
411 |
+
interactive=True,
|
412 |
+
visible=False) # WIP
|
413 |
col_tabs = gr.Column(elem_id="col_container", scale=10)
|
414 |
with (col_tabs, gr.Tabs()):
|
415 |
with gr.TabItem("Chat"):
|
|
|
457 |
mw1 = 50
|
458 |
mw2 = 50
|
459 |
with gr.Column(min_width=mw1):
|
460 |
+
submit = gr.Button(value='Submit', variant='primary', size='sm',
|
461 |
min_width=mw1)
|
462 |
+
stop_btn = gr.Button(value="Stop", variant='secondary', size='sm',
|
463 |
min_width=mw1)
|
464 |
save_chat_btn = gr.Button("Save", size='sm', min_width=mw1)
|
465 |
with gr.Column(min_width=mw2):
|
|
|
480 |
with gr.TabItem("Document Selection"):
|
481 |
document_choice = gr.Dropdown(docs_state0,
|
482 |
label="Select Subset of Document(s) %s" % file_types_str,
|
483 |
+
value=[DocumentChoice.ALL.value],
|
484 |
interactive=True,
|
485 |
multiselect=True,
|
486 |
visible=kwargs['langchain_mode'] != 'Disabled',
|
487 |
)
|
488 |
sources_visible = kwargs['langchain_mode'] != 'Disabled' and enable_sources_list
|
489 |
with gr.Row():
|
490 |
+
with gr.Column(scale=1):
|
491 |
+
get_sources_btn = gr.Button(value="Update UI with Document(s) from DB", scale=0, size='sm',
|
492 |
+
visible=sources_visible)
|
493 |
+
show_sources_btn = gr.Button(value="Show Sources from DB", scale=0, size='sm',
|
494 |
+
visible=sources_visible)
|
495 |
+
refresh_sources_btn = gr.Button(value="Update DB with new/changed files on disk", scale=0,
|
496 |
+
size='sm',
|
497 |
+
visible=sources_visible and allow_upload_to_user_data)
|
498 |
+
with gr.Column(scale=4):
|
499 |
+
pass
|
500 |
+
with gr.Row():
|
501 |
+
with gr.Column(scale=1):
|
502 |
+
add_placeholder = "e.g. UserData2, user_path2 (optional)" \
|
503 |
+
if not is_public else "e.g. MyData2"
|
504 |
+
remove_placeholder = "e.g. UserData2" if not is_public else "e.g. MyData2"
|
505 |
+
new_langchain_mode_text = gr.Textbox(value="", visible=allow_upload_to_user_data or
|
506 |
+
allow_upload_to_my_data,
|
507 |
+
label='Add Collection',
|
508 |
+
placeholder=add_placeholder,
|
509 |
+
interactive=True)
|
510 |
+
remove_langchain_mode_text = gr.Textbox(value="", visible=allow_upload_to_user_data or
|
511 |
+
allow_upload_to_my_data,
|
512 |
+
label='Remove Collection',
|
513 |
+
placeholder=remove_placeholder,
|
514 |
+
interactive=True)
|
515 |
+
load_langchain = gr.Button(value="Load LangChain State", scale=0, size='sm',
|
516 |
+
visible=allow_upload_to_user_data)
|
517 |
+
with gr.Column(scale=1):
|
518 |
+
df0 = get_df_langchain_mode_paths(selection_docs_state0)
|
519 |
+
langchain_mode_path_text = gr.Dataframe(value=df0,
|
520 |
+
visible=allow_upload_to_user_data or
|
521 |
+
allow_upload_to_my_data,
|
522 |
+
label='LangChain Mode-Path',
|
523 |
+
show_label=False,
|
524 |
+
interactive=False)
|
525 |
+
with gr.Column(scale=4):
|
526 |
+
pass
|
527 |
|
528 |
sources_row = gr.Row(visible=kwargs['langchain_mode'] != 'Disabled' and enable_sources_list,
|
529 |
equal_height=False)
|
|
|
548 |
value=None,
|
549 |
interactive=True,
|
550 |
multiselect=False,
|
551 |
+
visible=True,
|
552 |
)
|
553 |
with gr.Column(scale=4):
|
554 |
pass
|
|
|
793 |
side_bar_btn = gr.Button("Toggle SideBar", variant="secondary", size="sm")
|
794 |
submit_buttons_btn = gr.Button("Toggle Submit Buttons", variant="secondary", size="sm")
|
795 |
col_tabs_scale = gr.Slider(minimum=1, maximum=20, value=10, step=1, label='Window Size')
|
796 |
+
text_outputs_height = gr.Slider(minimum=100, maximum=2000, value=kwargs['height'] or 400,
|
797 |
+
step=50, label='Chat Height')
|
798 |
dark_mode_btn = gr.Button("Dark Mode", variant="secondary", size="sm")
|
799 |
with gr.Column(scale=4):
|
800 |
pass
|
801 |
+
system_visible0 = not is_public and not admin_pass
|
802 |
admin_row = gr.Row()
|
803 |
with admin_row:
|
804 |
with gr.Column(scale=1):
|
805 |
+
admin_pass_textbox = gr.Textbox(label="Admin Password", type='password',
|
806 |
+
visible=not system_visible0)
|
807 |
with gr.Column(scale=4):
|
808 |
pass
|
809 |
+
system_row = gr.Row(visible=system_visible0)
|
810 |
with system_row:
|
811 |
with gr.Column():
|
812 |
with gr.Row():
|
|
|
870 |
else:
|
871 |
return tuple([gr.update(interactive=True)] * len(args))
|
872 |
|
873 |
+
# Add to UserData or custom user db
|
874 |
update_db_func = functools.partial(update_user_db,
|
875 |
dbs=dbs,
|
876 |
db_type=db_type,
|
877 |
use_openai_embedding=use_openai_embedding,
|
878 |
hf_embedding_model=hf_embedding_model,
|
|
|
879 |
captions_model=captions_model,
|
880 |
+
enable_captions=enable_captions,
|
881 |
caption_loader=caption_loader,
|
882 |
+
enable_ocr=enable_ocr,
|
883 |
+
enable_pdf_ocr=enable_pdf_ocr,
|
884 |
verbose=kwargs['verbose'],
|
|
|
885 |
n_jobs=kwargs['n_jobs'],
|
886 |
)
|
887 |
add_file_outputs = [fileup_output, langchain_mode]
|
888 |
add_file_kwargs = dict(fn=update_db_func,
|
889 |
+
inputs=[fileup_output, my_db_state, selection_docs_state, chunk, chunk_size,
|
890 |
+
langchain_mode],
|
891 |
outputs=add_file_outputs + [sources_text, doc_exception_text],
|
892 |
queue=queue,
|
893 |
api_name='add_file' if allow_api and allow_upload_to_user_data else None)
|
|
|
899 |
eventdb1b = eventdb1.then(make_interactive, inputs=add_file_outputs, outputs=add_file_outputs,
|
900 |
show_progress='minimal')
|
901 |
|
902 |
+
# deal with challenge to have fileup_output itself as input
|
903 |
+
add_file_kwargs2 = dict(fn=update_db_func,
|
904 |
+
inputs=[fileup_output_text, my_db_state, selection_docs_state, chunk, chunk_size,
|
905 |
+
langchain_mode],
|
906 |
+
outputs=add_file_outputs + [sources_text, doc_exception_text],
|
907 |
+
queue=queue,
|
908 |
+
api_name='add_file_api' if allow_api and allow_upload_to_user_data else None)
|
909 |
+
eventdb1_api = fileup_output_text.submit(**add_file_kwargs2, show_progress='full')
|
910 |
+
|
911 |
# note for update_user_db_func output is ignored for db
|
912 |
|
913 |
def clear_textbox():
|
|
|
917 |
|
918 |
add_url_outputs = [url_text, langchain_mode]
|
919 |
add_url_kwargs = dict(fn=update_user_db_url_func,
|
920 |
+
inputs=[url_text, my_db_state, selection_docs_state, chunk, chunk_size,
|
921 |
+
langchain_mode],
|
922 |
outputs=add_url_outputs + [sources_text, doc_exception_text],
|
923 |
queue=queue,
|
924 |
api_name='add_url' if allow_api and allow_upload_to_user_data else None)
|
|
|
935 |
update_user_db_txt_func = functools.partial(update_db_func, is_txt=True)
|
936 |
add_text_outputs = [user_text_text, langchain_mode]
|
937 |
add_text_kwargs = dict(fn=update_user_db_txt_func,
|
938 |
+
inputs=[user_text_text, my_db_state, selection_docs_state, chunk, chunk_size,
|
939 |
+
langchain_mode],
|
940 |
outputs=add_text_outputs + [sources_text, doc_exception_text],
|
941 |
queue=queue,
|
942 |
api_name='add_text' if allow_api and allow_upload_to_user_data else None
|
|
|
948 |
eventdb3 = eventdb3b.then(**add_text_kwargs, show_progress='full')
|
949 |
eventdb3c = eventdb3.then(make_interactive, inputs=add_text_outputs, outputs=add_text_outputs,
|
950 |
show_progress='minimal')
|
951 |
+
db_events = [eventdb1a, eventdb1, eventdb1b, eventdb1_api,
|
952 |
eventdb2a, eventdb2, eventdb2b, eventdb2c,
|
953 |
eventdb3a, eventdb3b, eventdb3, eventdb3c]
|
954 |
|
|
|
956 |
|
957 |
# if change collection source, must clear doc selections from it to avoid inconsistency
|
958 |
def clear_doc_choice():
|
959 |
+
return gr.Dropdown.update(choices=docs_state0, value=DocumentChoice.ALL.value)
|
960 |
|
961 |
langchain_mode.change(clear_doc_choice, inputs=None, outputs=document_choice, queue=False)
|
962 |
|
963 |
def resize_col_tabs(x):
|
964 |
return gr.Dropdown.update(scale=x)
|
965 |
|
966 |
+
col_tabs_scale.change(fn=resize_col_tabs, inputs=col_tabs_scale, outputs=col_tabs, queue=False)
|
967 |
|
968 |
def resize_chatbots(x, num_model_lock=0):
|
969 |
if num_model_lock == 0:
|
|
|
974 |
|
975 |
resize_chatbots_func = functools.partial(resize_chatbots, num_model_lock=len(text_outputs))
|
976 |
text_outputs_height.change(fn=resize_chatbots_func, inputs=text_outputs_height,
|
977 |
+
outputs=[text_output, text_output2] + text_outputs, queue=False)
|
978 |
|
979 |
def update_dropdown(x):
|
980 |
return gr.Dropdown.update(choices=x, value=[docs_state0[0]])
|
|
|
1065 |
if file.startswith('http') or file.startswith('https'):
|
1066 |
# if file is online, then might as well use google(?)
|
1067 |
document1 = file
|
1068 |
+
return gr.update(visible=True,
|
1069 |
+
value=f"""<iframe width="1000" height="800" src="https://docs.google.com/viewerng/viewer?url={document1}&embedded=true" frameborder="0" height="100%" width="100%">
|
1070 |
</iframe>
|
1071 |
"""), dummy1, dummy1, dummy1
|
1072 |
else:
|
|
|
1089 |
|
1090 |
refresh_sources1 = functools.partial(update_and_get_source_files_given_langchain_mode,
|
1091 |
**get_kwargs(update_and_get_source_files_given_langchain_mode,
|
1092 |
+
exclude_names=['db1s', 'langchain_mode', 'chunk',
|
1093 |
+
'chunk_size'],
|
1094 |
**all_kwargs))
|
1095 |
+
eventdb9 = refresh_sources_btn.click(fn=refresh_sources1,
|
1096 |
+
inputs=[my_db_state, langchain_mode, chunk, chunk_size],
|
1097 |
outputs=sources_text,
|
1098 |
api_name='refresh_sources' if allow_api else None)
|
1099 |
|
|
|
1103 |
def close_admin(x):
|
1104 |
return gr.update(visible=not (x == admin_pass))
|
1105 |
|
1106 |
+
admin_pass_textbox.submit(check_admin_pass, inputs=admin_pass_textbox, outputs=system_row, queue=False) \
|
1107 |
.then(close_admin, inputs=admin_pass_textbox, outputs=admin_row, queue=False)
|
1108 |
|
1109 |
+
def add_langchain_mode(db1s, selection_docs_state1, langchain_mode1, y):
|
1110 |
+
for k in db1s:
|
1111 |
+
set_userid(db1s[k])
|
1112 |
+
langchain_modes = selection_docs_state1['langchain_modes']
|
1113 |
+
langchain_mode_paths = selection_docs_state1['langchain_mode_paths']
|
1114 |
+
visible_langchain_modes = selection_docs_state1['visible_langchain_modes']
|
1115 |
+
|
1116 |
+
user_path = None
|
1117 |
+
valid = True
|
1118 |
+
y2 = y.strip().replace(' ', '').split(',')
|
1119 |
+
if len(y2) >= 1:
|
1120 |
+
langchain_mode2 = y2[0]
|
1121 |
+
if len(langchain_mode2) >= 3 and langchain_mode2.isalnum():
|
1122 |
+
# real restriction is:
|
1123 |
+
# ValueError: Expected collection name that (1) contains 3-63 characters, (2) starts and ends with an alphanumeric character, (3) otherwise contains only alphanumeric characters, underscores or hyphens (-), (4) contains no two consecutive periods (..) and (5) is not a valid IPv4 address, got me
|
1124 |
+
# but just make simpler
|
1125 |
+
user_path = y2[1] if len(y2) > 1 else None # assume scratch if don't have user_path
|
1126 |
+
if user_path in ['', "''"]:
|
1127 |
+
# for scratch spaces
|
1128 |
+
user_path = None
|
1129 |
+
if langchain_mode2 in langchain_modes_intrinsic:
|
1130 |
+
user_path = None
|
1131 |
+
textbox = "Invalid access to use internal name: %s" % langchain_mode2
|
1132 |
+
valid = False
|
1133 |
+
langchain_mode2 = langchain_mode1
|
1134 |
+
elif user_path and allow_upload_to_user_data or not user_path and allow_upload_to_my_data:
|
1135 |
+
langchain_mode_paths.update({langchain_mode2: user_path})
|
1136 |
+
if langchain_mode2 not in visible_langchain_modes:
|
1137 |
+
visible_langchain_modes.append(langchain_mode2)
|
1138 |
+
if langchain_mode2 not in langchain_modes:
|
1139 |
+
langchain_modes.append(langchain_mode2)
|
1140 |
+
textbox = ''
|
1141 |
+
if user_path:
|
1142 |
+
makedirs(user_path, exist_ok=True)
|
1143 |
+
else:
|
1144 |
+
valid = False
|
1145 |
+
langchain_mode2 = langchain_mode1
|
1146 |
+
textbox = "Invalid access. user allowed: %s " \
|
1147 |
+
"scratch allowed: %s" % (allow_upload_to_user_data, allow_upload_to_my_data)
|
1148 |
+
else:
|
1149 |
+
valid = False
|
1150 |
+
langchain_mode2 = langchain_mode1
|
1151 |
+
textbox = "Invalid, collection must be >=3 characters and alphanumeric"
|
1152 |
+
else:
|
1153 |
+
valid = False
|
1154 |
+
langchain_mode2 = langchain_mode1
|
1155 |
+
textbox = "Invalid, must be like UserData2, user_path2"
|
1156 |
+
selection_docs_state1 = update_langchain_mode_paths(db1s, selection_docs_state1)
|
1157 |
+
df_langchain_mode_paths1 = get_df_langchain_mode_paths(selection_docs_state1)
|
1158 |
+
choices = get_langchain_choices(selection_docs_state1)
|
1159 |
+
|
1160 |
+
if valid and not user_path:
|
1161 |
+
# needs to have key for it to make it known different from userdata case in _update_user_db()
|
1162 |
+
db1s[langchain_mode2] = [None, None]
|
1163 |
+
if valid:
|
1164 |
+
save_collection_names(langchain_modes, visible_langchain_modes, langchain_mode_paths, LangChainMode,
|
1165 |
+
db1s)
|
1166 |
+
|
1167 |
+
return db1s, selection_docs_state1, gr.update(choices=choices,
|
1168 |
+
value=langchain_mode2), textbox, df_langchain_mode_paths1
|
1169 |
+
|
1170 |
+
def remove_langchain_mode(db1s, selection_docs_state1, langchain_mode1, langchain_mode2, dbsu=None):
|
1171 |
+
for k in db1s:
|
1172 |
+
set_userid(db1s[k])
|
1173 |
+
assert dbsu is not None
|
1174 |
+
langchain_modes = selection_docs_state1['langchain_modes']
|
1175 |
+
langchain_mode_paths = selection_docs_state1['langchain_mode_paths']
|
1176 |
+
visible_langchain_modes = selection_docs_state1['visible_langchain_modes']
|
1177 |
+
|
1178 |
+
if langchain_mode2 in db1s and not allow_upload_to_my_data or \
|
1179 |
+
dbsu is not None and langchain_mode2 in dbsu and not allow_upload_to_user_data or \
|
1180 |
+
langchain_mode2 in langchain_modes_intrinsic:
|
1181 |
+
# NOTE: Doesn't fail if remove MyData, but didn't debug odd behavior seen with upload after gone
|
1182 |
+
textbox = "Invalid access, cannot remove %s" % langchain_mode2
|
1183 |
+
df_langchain_mode_paths1 = get_df_langchain_mode_paths(selection_docs_state1)
|
1184 |
+
else:
|
1185 |
+
# change global variables
|
1186 |
+
if langchain_mode2 in visible_langchain_modes:
|
1187 |
+
visible_langchain_modes.remove(langchain_mode2)
|
1188 |
+
textbox = ""
|
1189 |
+
else:
|
1190 |
+
textbox = "%s was not visible" % langchain_mode2
|
1191 |
+
if langchain_mode2 in langchain_modes:
|
1192 |
+
langchain_modes.remove(langchain_mode2)
|
1193 |
+
if langchain_mode2 in langchain_mode_paths:
|
1194 |
+
langchain_mode_paths.pop(langchain_mode2)
|
1195 |
+
if langchain_mode2 in db1s:
|
1196 |
+
# remove db entirely, so not in list, else need to manage visible list in update_langchain_mode_paths()
|
1197 |
+
# FIXME: Remove location?
|
1198 |
+
if langchain_mode2 != LangChainMode.MY_DATA.value:
|
1199 |
+
# don't remove last MyData, used as user hash
|
1200 |
+
db1s.pop(langchain_mode2)
|
1201 |
+
# only show
|
1202 |
+
selection_docs_state1 = update_langchain_mode_paths(db1s, selection_docs_state1)
|
1203 |
+
df_langchain_mode_paths1 = get_df_langchain_mode_paths(selection_docs_state1)
|
1204 |
+
|
1205 |
+
save_collection_names(langchain_modes, visible_langchain_modes, langchain_mode_paths, LangChainMode,
|
1206 |
+
db1s)
|
1207 |
+
|
1208 |
+
return db1s, selection_docs_state1, \
|
1209 |
+
gr.update(choices=get_langchain_choices(selection_docs_state1),
|
1210 |
+
value=langchain_mode2), textbox, df_langchain_mode_paths1
|
1211 |
+
|
1212 |
+
new_langchain_mode_text.submit(fn=add_langchain_mode,
|
1213 |
+
inputs=[my_db_state, selection_docs_state, langchain_mode,
|
1214 |
+
new_langchain_mode_text],
|
1215 |
+
outputs=[my_db_state, selection_docs_state, langchain_mode,
|
1216 |
+
new_langchain_mode_text,
|
1217 |
+
langchain_mode_path_text],
|
1218 |
+
api_name='new_langchain_mode_text' if allow_api and allow_upload_to_user_data else None)
|
1219 |
+
remove_langchain_mode_func = functools.partial(remove_langchain_mode, dbsu=dbs)
|
1220 |
+
remove_langchain_mode_text.submit(fn=remove_langchain_mode_func,
|
1221 |
+
inputs=[my_db_state, selection_docs_state, langchain_mode,
|
1222 |
+
remove_langchain_mode_text],
|
1223 |
+
outputs=[my_db_state, selection_docs_state, langchain_mode,
|
1224 |
+
remove_langchain_mode_text,
|
1225 |
+
langchain_mode_path_text],
|
1226 |
+
api_name='remove_langchain_mode_text' if allow_api and allow_upload_to_user_data else None)
|
1227 |
+
|
1228 |
+
def update_langchain_gr(db1s, selection_docs_state1, langchain_mode1):
|
1229 |
+
for k in db1s:
|
1230 |
+
set_userid(db1s[k])
|
1231 |
+
langchain_modes = selection_docs_state1['langchain_modes']
|
1232 |
+
langchain_mode_paths = selection_docs_state1['langchain_mode_paths']
|
1233 |
+
visible_langchain_modes = selection_docs_state1['visible_langchain_modes']
|
1234 |
+
# in-place
|
1235 |
+
|
1236 |
+
# update user collaborative collections
|
1237 |
+
update_langchain(langchain_modes, visible_langchain_modes, langchain_mode_paths, '')
|
1238 |
+
# update scratch single-user collections
|
1239 |
+
user_hash = db1s.get(LangChainMode.MY_DATA.value, '')[1]
|
1240 |
+
update_langchain(langchain_modes, visible_langchain_modes, langchain_mode_paths, user_hash)
|
1241 |
+
|
1242 |
+
selection_docs_state1 = update_langchain_mode_paths(db1s, selection_docs_state1)
|
1243 |
+
df_langchain_mode_paths1 = get_df_langchain_mode_paths(selection_docs_state1)
|
1244 |
+
return selection_docs_state1, \
|
1245 |
+
gr.update(choices=get_langchain_choices(selection_docs_state1),
|
1246 |
+
value=langchain_mode1), df_langchain_mode_paths1
|
1247 |
+
|
1248 |
+
load_langchain.click(fn=update_langchain_gr,
|
1249 |
+
inputs=[my_db_state, selection_docs_state, langchain_mode],
|
1250 |
+
outputs=[selection_docs_state, langchain_mode, langchain_mode_path_text],
|
1251 |
+
api_name='load_langchain' if allow_api and allow_upload_to_user_data else None)
|
1252 |
+
|
1253 |
inputs_list, inputs_dict = get_inputs_list(all_kwargs, kwargs['model_lower'], model_id=1)
|
1254 |
inputs_list2, inputs_dict2 = get_inputs_list(all_kwargs, kwargs['model_lower'], model_id=2)
|
1255 |
from functools import partial
|
|
|
1261 |
def evaluate_nochat(*args1, default_kwargs1=None, str_api=False, **kwargs1):
|
1262 |
args_list = list(args1)
|
1263 |
if str_api:
|
1264 |
+
user_kwargs = args_list[len(input_args_list)]
|
1265 |
assert isinstance(user_kwargs, str)
|
1266 |
user_kwargs = ast.literal_eval(user_kwargs)
|
1267 |
else:
|
1268 |
+
user_kwargs = {k: v for k, v in zip(eval_func_param_names, args_list[len(input_args_list):])}
|
1269 |
# only used for submit_nochat_api
|
1270 |
user_kwargs['chat'] = False
|
1271 |
if 'stream_output' not in user_kwargs:
|
|
|
1275 |
user_kwargs['langchain_mode'] = 'Disabled'
|
1276 |
if 'langchain_action' not in user_kwargs:
|
1277 |
user_kwargs['langchain_action'] = LangChainAction.QUERY.value
|
1278 |
+
if 'langchain_agents' not in user_kwargs:
|
1279 |
+
user_kwargs['langchain_agents'] = []
|
1280 |
|
1281 |
set1 = set(list(default_kwargs1.keys()))
|
1282 |
set2 = set(eval_func_param_names)
|
|
|
1284 |
# correct ordering. Note some things may not be in default_kwargs, so can't be default of user_kwargs.get()
|
1285 |
model_state1 = args_list[0]
|
1286 |
my_db_state1 = args_list[1]
|
1287 |
+
selection_docs_state1 = args_list[2]
|
1288 |
args_list = [user_kwargs[k] if k in user_kwargs and user_kwargs[k] is not None else default_kwargs1[k] for k
|
1289 |
in eval_func_param_names]
|
1290 |
assert len(args_list) == len(eval_func_param_names)
|
1291 |
+
args_list = [model_state1, my_db_state1, selection_docs_state1] + args_list
|
1292 |
|
1293 |
try:
|
1294 |
for res_dict in evaluate(*tuple(args_list), **kwargs1):
|
|
|
1459 |
prompt_type1 = args_list[eval_func_param_names.index('prompt_type')]
|
1460 |
langchain_mode1 = args_list[eval_func_param_names.index('langchain_mode')]
|
1461 |
langchain_action1 = args_list[eval_func_param_names.index('langchain_action')]
|
1462 |
+
langchain_agents1 = args_list[eval_func_param_names.index('langchain_agents')]
|
1463 |
document_subset1 = args_list[eval_func_param_names.index('document_subset')]
|
1464 |
document_choice1 = args_list[eval_func_param_names.index('document_choice')]
|
1465 |
if not prompt_type1:
|
|
|
1492 |
history[-1][1] = None
|
1493 |
return history
|
1494 |
if user_message1 in ['', None, '\n']:
|
1495 |
+
if not allow_empty_instruction(langchain_mode1, document_subset1, langchain_action1):
|
|
1496 |
# reject non-retry submit/enter
|
1497 |
return history
|
1498 |
user_message1 = fix_text_for_gradio(user_message1)
|
|
|
1539 |
API only called for which_model=0, default for inputs_list, but rest should ignore inputs_list
|
1540 |
:return: last element is True if should run bot, False if should just yield history
|
1541 |
"""
|
1542 |
+
isize = len(input_args_list) + 1 # states + chat history
|
1543 |
# don't deepcopy, can contain model itself
|
1544 |
args_list = list(args).copy()
|
1545 |
+
model_state1 = args_list[-isize]
|
1546 |
+
my_db_state1 = args_list[-isize + 1]
|
1547 |
+
selection_docs_state1 = args_list[-isize + 2]
|
1548 |
history = args_list[-1]
|
1549 |
prompt_type1 = args_list[eval_func_param_names.index('prompt_type')]
|
1550 |
prompt_dict1 = args_list[eval_func_param_names.index('prompt_dict')]
|
|
|
1552 |
if model_state1['model'] is None or model_state1['model'] == no_model_str:
|
1553 |
return history, None, None, None
|
1554 |
|
1555 |
+
args_list = args_list[:-isize] # only keep rest needed for evaluate()
|
1556 |
langchain_mode1 = args_list[eval_func_param_names.index('langchain_mode')]
|
1557 |
+
add_chat_history_to_context1 = args_list[eval_func_param_names.index('add_chat_history_to_context')]
|
1558 |
langchain_action1 = args_list[eval_func_param_names.index('langchain_action')]
|
1559 |
+
langchain_agents1 = args_list[eval_func_param_names.index('langchain_agents')]
|
1560 |
document_subset1 = args_list[eval_func_param_names.index('document_subset')]
|
1561 |
document_choice1 = args_list[eval_func_param_names.index('document_choice')]
|
1562 |
if not history:
|
|
|
1569 |
instruction1 = history[-1][0]
|
1570 |
history[-1][1] = None
|
1571 |
elif not instruction1:
|
1572 |
+
if not allow_empty_instruction(langchain_mode1, document_subset1, langchain_action1):
|
|
1573 |
# if not retrying, then reject empty query
|
1574 |
return history, None, None, None
|
1575 |
elif len(history) > 0 and history[-1][1] not in [None, '']:
|
|
|
1586 |
|
1587 |
chat1 = args_list[eval_func_param_names.index('chat')]
|
1588 |
model_max_length1 = get_model_max_length(model_state1)
|
1589 |
+
context1 = history_to_context(history, langchain_mode1,
|
1590 |
+
add_chat_history_to_context1,
|
1591 |
+
prompt_type1, prompt_dict1, chat1,
|
1592 |
model_max_length1, memory_restriction_level,
|
1593 |
kwargs['keep_sources_in_context'])
|
1594 |
args_list[0] = instruction1 # override original instruction with history from user
|
|
|
1597 |
fun1 = partial(evaluate,
|
1598 |
model_state1,
|
1599 |
my_db_state1,
|
1600 |
+
selection_docs_state1,
|
1601 |
*tuple(args_list),
|
1602 |
**kwargs_evaluate)
|
1603 |
|
|
|
1643 |
clear_torch_cache()
|
1644 |
return
|
1645 |
|
1646 |
+
def clear_embeddings(langchain_mode1, db1s):
|
1647 |
# clear any use of embedding that sits on GPU, else keeps accumulating GPU usage even if clear torch cache
|
1648 |
+
if db_type == 'chroma' and langchain_mode1 not in ['LLM', 'Disabled', None, '']:
|
1649 |
from gpt_langchain import clear_embedding
|
1650 |
db = dbs.get('langchain_mode1')
|
1651 |
if db is not None and not isinstance(db, str):
|
1652 |
clear_embedding(db)
|
1653 |
+
if db1s is not None and langchain_mode1 in db1s:
|
1654 |
+
db1 = db1s[langchain_mode1]
|
1655 |
+
if len(db1) == 2:
|
1656 |
+
clear_embedding(db1[0])
|
1657 |
|
1658 |
def bot(*args, retry=False):
|
1659 |
+
history, fun1, langchain_mode1, db1 = prep_bot(*args, retry=retry)
|
1660 |
try:
|
1661 |
for res in get_response(fun1, history):
|
1662 |
yield res
|
1663 |
finally:
|
1664 |
clear_torch_cache()
|
1665 |
+
clear_embeddings(langchain_mode1, db1)
|
1666 |
|
1667 |
def all_bot(*args, retry=False, model_states1=None):
|
1668 |
args_list = list(args).copy()
|
|
|
1672 |
stream_output1 = args_list[eval_func_param_names.index('stream_output')]
|
1673 |
max_time1 = args_list[eval_func_param_names.index('max_time')]
|
1674 |
langchain_mode1 = args_list[eval_func_param_names.index('langchain_mode')]
|
1675 |
+
isize = len(input_args_list) + 1 # states + chat history
|
1676 |
+
db1s = None
|
1677 |
try:
|
1678 |
gen_list = []
|
1679 |
for chatboti, (chatbot1, model_state1) in enumerate(zip(chatbots, model_states1)):
|
1680 |
args_list1 = args_list0.copy()
|
1681 |
+
args_list1.insert(-isize + 2,
|
1682 |
+
model_state1) # insert at -2 so is at -3, and after chatbot1 added, at -4
|
1683 |
# if at start, have None in response still, replace with '' so client etc. acts like normal
|
1684 |
# assumes other parts of code treat '' and None as if no response yet from bot
|
1685 |
# can't do this later in bot code as racy with threaded generators
|
|
|
1689 |
# so consistent with prep_bot()
|
1690 |
# with model_state1 at -3, my_db_state1 at -2, and history(chatbot) at -1
|
1691 |
# langchain_mode1 and my_db_state1 should be same for every bot
|
1692 |
+
history, fun1, langchain_mode1, db1s = prep_bot(*tuple(args_list1), retry=retry,
|
1693 |
+
which_model=chatboti)
|
1694 |
gen1 = get_response(fun1, history)
|
1695 |
if stream_output1:
|
1696 |
gen1 = TimeoutIterator(gen1, timeout=0.01, sentinel=None, raise_on_exception=False)
|
|
|
1736 |
print("Generate exceptions: %s" % exceptions, flush=True)
|
1737 |
finally:
|
1738 |
clear_torch_cache()
|
1739 |
+
clear_embeddings(langchain_mode1, db1s)
|
1740 |
|
1741 |
# NORMAL MODEL
|
1742 |
user_args = dict(fn=functools.partial(user, sanitize_user_prompt=kwargs['sanitize_user_prompt']),
|
|
|
1744 |
outputs=text_output,
|
1745 |
)
|
1746 |
bot_args = dict(fn=bot,
|
1747 |
+
inputs=inputs_list + [model_state, my_db_state, selection_docs_state] + [text_output],
|
1748 |
outputs=[text_output, chat_exception_text],
|
1749 |
)
|
1750 |
retry_bot_args = dict(fn=functools.partial(bot, retry=True),
|
1751 |
+
inputs=inputs_list + [model_state, my_db_state, selection_docs_state] + [text_output],
|
1752 |
outputs=[text_output, chat_exception_text],
|
1753 |
)
|
1754 |
retry_user_args = dict(fn=functools.partial(user, retry=True),
|
|
|
1766 |
outputs=text_output2,
|
1767 |
)
|
1768 |
bot_args2 = dict(fn=bot,
|
1769 |
+
inputs=inputs_list2 + [model_state2, my_db_state, selection_docs_state] + [text_output2],
|
1770 |
outputs=[text_output2, chat_exception_text],
|
1771 |
)
|
1772 |
retry_bot_args2 = dict(fn=functools.partial(bot, retry=True),
|
1773 |
+
inputs=inputs_list2 + [model_state2, my_db_state, selection_docs_state] + [text_output2],
|
1774 |
outputs=[text_output2, chat_exception_text],
|
1775 |
)
|
1776 |
retry_user_args2 = dict(fn=functools.partial(user, retry=True),
|
|
|
1791 |
outputs=text_outputs,
|
1792 |
)
|
1793 |
all_bot_args = dict(fn=functools.partial(all_bot, model_states1=model_states),
|
1794 |
+
inputs=inputs_list + [my_db_state, selection_docs_state] + text_outputs,
|
1795 |
outputs=text_outputs + [chat_exception_text],
|
1796 |
)
|
1797 |
all_retry_bot_args = dict(fn=functools.partial(all_bot, model_states1=model_states, retry=True),
|
1798 |
+
inputs=inputs_list + [my_db_state, selection_docs_state] + text_outputs,
|
1799 |
outputs=text_outputs + [chat_exception_text],
|
1800 |
)
|
1801 |
all_retry_user_args = dict(fn=functools.partial(all_user, retry=True,
|
|
|
1957 |
def get_short_chat(x, short_chats, short_len=20, words=4):
|
1958 |
if x and len(x[0]) == 2 and x[0][0] is not None:
|
1959 |
short_chat = ' '.join(x[0][0][:short_len].split(' ')[:words]).strip()
|
1960 |
+
if not short_chat:
|
1961 |
+
# e.g.summarization, try using answer
|
1962 |
+
short_chat = ' '.join(x[0][1][:short_len].split(' ')[:words]).strip()
|
1963 |
+
if not short_chat:
|
1964 |
+
short_chat = 'Unk'
|
1965 |
short_chat = dedup(short_chat, short_chats)
|
1966 |
else:
|
1967 |
short_chat = None
|
|
|
2029 |
already_exists = any([is_chat_same(chat_list, x) for x in old_chat_lists])
|
2030 |
if not already_exists:
|
2031 |
chat_state1[short_chat] = chat_list.copy()
|
2032 |
+
|
2033 |
+
# reverse so newest at top
|
2034 |
+
choices = list(chat_state1.keys()).copy()
|
2035 |
+
choices.reverse()
|
2036 |
+
|
2037 |
+
return chat_state1, gr.update(choices=choices, value=None)
|
|
2038 |
|
2039 |
def switch_chat(chat_key, chat_state1, num_model_lock=0):
|
2040 |
chosen_chat = chat_state1[chat_key]
|
|
|
2065 |
|
2066 |
remove_chat_event = remove_chat_btn.click(remove_chat,
|
2067 |
inputs=[radio_chats, chat_state], outputs=[radio_chats, chat_state],
|
2068 |
+
queue=False, api_name='remove_chat')
|
2069 |
|
2070 |
def get_chats1(chat_state1):
|
2071 |
base = 'chats'
|
|
|
2096 |
new_chats = json.loads(f.read())
|
2097 |
for chat1_k, chat1_v in new_chats.items():
|
2098 |
# ignore chat1_k, regenerate and de-dup to avoid loss
|
2099 |
+
chat_state1, _ = save_chat(chat1_v, chat_state1, chat_is_list=True)
|
2100 |
except BaseException as e:
|
2101 |
t, v, tb = sys.exc_info()
|
2102 |
ex = ''.join(traceback.format_exception(t, v, tb))
|
|
|
2122 |
.then(deselect_radio_chats, inputs=None, outputs=radio_chats, queue=False) \
|
2123 |
.then(clear_scores, outputs=[score_text, score_text2, score_text_nochat])
|
2124 |
|
|
2125 |
clear_event = save_chat_btn.click(save_chat,
|
2126 |
inputs=[text_output, text_output2] + text_outputs + [chat_state],
|
2127 |
+
outputs=[chat_state, radio_chats],
|
2128 |
+
api_name='save_chat' if allow_api else None)
|
2129 |
+
if kwargs['score_model']:
|
2130 |
+
clear_event2 = clear_event.then(clear_scores, outputs=[score_text, score_text2, score_text_nochat])
|
|
|
2131 |
|
2132 |
# NOTE: clear of instruction/iinput for nochat has to come after score,
|
2133 |
# because score for nochat consumes actual textbox, while chat consumes chat history filled by user()
|
2134 |
no_chat_args = dict(fn=fun,
|
2135 |
+
inputs=[model_state, my_db_state, selection_docs_state] + inputs_list,
|
2136 |
outputs=text_output_nochat,
|
2137 |
queue=queue,
|
2138 |
)
|
|
|
2151 |
.then(clear_torch_cache)
|
2152 |
|
2153 |
submit_event_nochat_api = submit_nochat_api.click(fun_with_dict_str,
|
2154 |
+
inputs=[model_state, my_db_state, selection_docs_state,
|
2155 |
+
inputs_dict_str],
|
2156 |
outputs=text_output_nochat_api,
|
2157 |
queue=True, # required for generator
|
2158 |
api_name='submit_nochat_api' if allow_api else None) \
|
|
|
2402 |
print("Exception: %s" % str(e), flush=True)
|
2403 |
return json.dumps(sys_dict)
|
2404 |
|
2405 |
+
system_kwargs = all_kwargs.copy()
|
2406 |
+
system_kwargs.update(dict(command=str(' '.join(sys.argv))))
|
2407 |
get_system_info_dict_func = functools.partial(get_system_info_dict, **all_kwargs)
|
2408 |
|
2409 |
system_dict_event = system_btn2.click(get_system_info_dict_func,
|
|
|
2433 |
else:
|
2434 |
tokenizer = None
|
2435 |
if tokenizer is not None:
|
2436 |
+
langchain_mode1 = 'LLM'
|
2437 |
+
add_chat_history_to_context1 = True
|
2438 |
# fake user message to mimic bot()
|
2439 |
chat1 = copy.deepcopy(chat1)
|
2440 |
chat1 = chat1 + [['user_message1', None]]
|
2441 |
model_max_length1 = tokenizer.model_max_length
|
2442 |
+
context1 = history_to_context(chat1, langchain_mode1,
|
2443 |
+
add_chat_history_to_context1,
|
2444 |
+
prompt_type1, prompt_dict1, chat1,
|
2445 |
model_max_length1,
|
2446 |
memory_restriction_level1, keep_sources_in_context1)
|
2447 |
return str(tokenizer(context1, return_tensors="pt")['input_ids'].shape[1])
|
|
|
2471 |
,
|
2472 |
queue=False, api_name='stop' if allow_api else None).then(clear_torch_cache, queue=False)
|
2473 |
|
2474 |
+
demo.load(None, None, None, _js=get_dark_js() if kwargs['dark'] else None)
|
2475 |
|
2476 |
demo.queue(concurrency_count=kwargs['concurrency_count'], api_open=kwargs['api_open'])
|
2477 |
favicon_path = "h2o-logo.svg"
|
|
|
2486 |
# FIXME: disable for gptj, langchain or gpt4all modify print itself
|
2487 |
# FIXME: and any multi-threaded/async print will enter model output!
|
2488 |
scheduler.add_job(func=ping, trigger="interval", seconds=60)
|
2489 |
+
if is_public or os.getenv('PING_GPU'):
|
2490 |
+
scheduler.add_job(func=ping_gpu, trigger="interval", seconds=60 * 10)
|
2491 |
scheduler.start()
|
2492 |
|
2493 |
# import control
|
|
|
2506 |
demo.block_thread()
|
2507 |
|
2508 |
|
|
2509 |
def get_inputs_list(inputs_dict, model_lower, model_id=1):
|
2510 |
"""
|
2511 |
map gradio objects in locals() to inputs for evaluate().
|
|
|
2539 |
return inputs_list, inputs_dict_out
|
2540 |
|
2541 |
|
2542 |
+
def get_sources(db1s, langchain_mode, dbs=None, docs_state0=None):
|
2543 |
+
for k in db1s:
|
2544 |
+
set_userid(db1s[k])
|
2545 |
|
2546 |
if langchain_mode in ['ChatLLM', 'LLM']:
|
2547 |
source_files_added = "NA"
|
|
|
2550 |
source_files_added = "Not showing wiki_full, takes about 20 seconds and makes 4MB file." \
|
2551 |
" Ask jon.mckinney@h2o.ai for file if required."
|
2552 |
source_list = []
|
2553 |
+
elif langchain_mode in db1s and len(db1s[langchain_mode]) == 2 and db1s[langchain_mode][0] is not None:
|
2554 |
+
db1 = db1s[langchain_mode]
|
2555 |
from gpt_langchain import get_metadatas
|
2556 |
metadatas = get_metadatas(db1[0])
|
2557 |
source_list = sorted(set([x['source'] for x in metadatas]))
|
|
|
2582 |
db1[1] = str(uuid.uuid4())
|
2583 |
|
2584 |
|
2585 |
+
def update_user_db(file, db1s, selection_docs_state1, chunk, chunk_size, langchain_mode, dbs=None, **kwargs):
|
2586 |
+
kwargs.update(selection_docs_state1)
|
|
|
2587 |
if file is None:
|
2588 |
raise RuntimeError("Don't use change, use input")
|
2589 |
|
2590 |
try:
|
2591 |
+
return _update_user_db(file, db1s=db1s, chunk=chunk, chunk_size=chunk_size,
|
2592 |
langchain_mode=langchain_mode, dbs=dbs,
|
2593 |
**kwargs)
|
2594 |
except BaseException as e:
|
|
|
2619 |
user_id = db1[1]
|
2620 |
base_path = 'locks'
|
2621 |
makedirs(base_path)
|
2622 |
+
lock_file = os.path.join(base_path, "db_%s_%s.lock" % (langchain_mode.replace(' ', '_'), user_id))
|
2623 |
return lock_file
|
2624 |
|
2625 |
|
2626 |
def _update_user_db(file,
|
2627 |
+
db1s=None,
|
2628 |
chunk=None, chunk_size=None,
|
2629 |
+
dbs=None, db_type=None,
|
2630 |
+
langchain_mode='UserData',
|
2631 |
+
langchain_modes=None, # unused but required as part of selection_docs_state1
|
2632 |
+
langchain_mode_paths=None,
|
2633 |
+
visible_langchain_modes=None,
|
2634 |
use_openai_embedding=None,
|
2635 |
hf_embedding_model=None,
|
2636 |
caption_loader=None,
|
2637 |
enable_captions=None,
|
2638 |
captions_model=None,
|
2639 |
enable_ocr=None,
|
2640 |
+
enable_pdf_ocr=None,
|
2641 |
verbose=None,
|
2642 |
+
n_jobs=-1,
|
2643 |
is_url=None, is_txt=None,
|
2644 |
+
):
|
2645 |
+
assert db1s is not None
|
2646 |
assert chunk is not None
|
2647 |
assert chunk_size is not None
|
2648 |
assert use_openai_embedding is not None
|
|
|
2651 |
assert enable_captions is not None
|
2652 |
assert captions_model is not None
|
2653 |
assert enable_ocr is not None
|
2654 |
+
assert enable_pdf_ocr is not None
|
2655 |
assert verbose is not None
|
2656 |
|
|
|
|
|
2657 |
if dbs is None:
|
2658 |
dbs = {}
|
2659 |
assert isinstance(dbs, dict), "Wrong type for dbs: %s" % str(type(dbs))
|
|
|
2671 |
if langchain_mode == LangChainMode.DISABLED.value:
|
2672 |
return None, langchain_mode, get_source_files(), ""
|
2673 |
|
2674 |
+
if langchain_mode in [LangChainMode.LLM.value]:
|
2675 |
# then switch to MyData, so langchain_mode also becomes way to select where upload goes
|
2676 |
# but default to mydata if nothing chosen, since safest
|
2677 |
+
if LangChainMode.MY_DATA.value in visible_langchain_modes:
|
2678 |
+
langchain_mode = LangChainMode.MY_DATA.value
|
2679 |
+
|
2680 |
+
if langchain_mode_paths is None:
|
2681 |
+
langchain_mode_paths = {}
|
2682 |
+
user_path = langchain_mode_paths.get(langchain_mode)
|
2683 |
+
# UserData or custom, which has to be from user's disk
|
2684 |
+
if user_path is not None:
|
2685 |
# move temp files from gradio upload to stable location
|
2686 |
for fili, fil in enumerate(file):
|
2687 |
+
if isinstance(fil, str) and os.path.isfile(fil): # not url, text
|
2688 |
+
new_fil = os.path.normpath(os.path.join(user_path, os.path.basename(fil)))
|
2689 |
+
if os.path.normpath(os.path.abspath(fil)) != os.path.normpath(os.path.abspath(new_fil)):
|
2690 |
if os.path.isfile(new_fil):
|
2691 |
remove(new_fil)
|
2692 |
try:
|
|
|
2706 |
enable_captions=enable_captions,
|
2707 |
captions_model=captions_model,
|
2708 |
enable_ocr=enable_ocr,
|
2709 |
+
enable_pdf_ocr=enable_pdf_ocr,
|
2710 |
caption_loader=caption_loader,
|
2711 |
)
|
2712 |
exceptions = [x for x in sources if x.metadata.get('exception')]
|
2713 |
exceptions_strs = [x.metadata['exception'] for x in exceptions]
|
2714 |
sources = [x for x in sources if 'exception' not in x.metadata]
|
2715 |
|
2716 |
+
# below must at least come after langchain_mode is modified in case was LLM -> MyData,
|
2717 |
+
# so original langchain mode changed
|
2718 |
+
for k in db1s:
|
2719 |
+
set_userid(db1s[k])
|
2720 |
+
db1 = get_db1(db1s, langchain_mode)
|
2721 |
+
|
2722 |
+
lock_file = get_lock_file(db1s[LangChainMode.MY_DATA.value], langchain_mode) # user-level lock, not db-level lock
|
2723 |
with filelock.FileLock(lock_file):
|
2724 |
+
if langchain_mode in db1s:
|
2725 |
if db1[0] is not None:
|
2726 |
# then add
|
2727 |
db, num_new_sources, new_sources_metadata = add_to_db(db1[0], sources, db_type=db_type,
|
|
|
2731 |
# in testing expect:
|
2732 |
# assert len(db1) == 2 and db1[1] is None, "Bad MyData db: %s" % db1
|
2733 |
# for production hit, when user gets clicky:
|
2734 |
+
assert len(db1) == 2, "Bad %s db: %s" % (langchain_mode, db1)
|
2735 |
+
assert db1[1] is not None, "db hash was None, not allowed"
|
2736 |
# then create
|
2737 |
# if added has to original state and didn't change, then would be shared db for all users
|
2738 |
persist_directory = os.path.join(scratch_base_dir, 'db_dir_%s_%s' % (langchain_mode, db1[1]))
|
|
|
2754 |
use_openai_embedding=use_openai_embedding,
|
2755 |
hf_embedding_model=hf_embedding_model)
|
2756 |
else:
|
2757 |
+
# then create. Or might just be that dbs is unfilled, then it will fill, then add
|
2758 |
db = get_db(sources, use_openai_embedding=use_openai_embedding,
|
2759 |
db_type=db_type,
|
2760 |
persist_directory=persist_directory,
|
|
|
2768 |
return None, langchain_mode, source_files_added, '\n'.join(exceptions_strs)
|
2769 |
|
2770 |
|
2771 |
+
def get_db(db1s, langchain_mode, dbs=None):
|
2772 |
+
db1 = get_db1(db1s, langchain_mode)
|
2773 |
+
lock_file = get_lock_file(db1s[LangChainMode.MY_DATA.value], langchain_mode)
|
2774 |
|
2775 |
with filelock.FileLock(lock_file):
|
2776 |
if langchain_mode in ['wiki_full']:
|
2777 |
# NOTE: avoid showing full wiki. Takes about 30 seconds over about 90k entries, but not useful for now
|
2778 |
db = None
|
2779 |
+
elif langchain_mode in db1s and len(db1) == 2 and db1[0] is not None:
|
2780 |
db = db1[0]
|
2781 |
elif dbs is not None and langchain_mode in dbs and dbs[langchain_mode] is not None:
|
2782 |
db = dbs[langchain_mode]
|
|
|
2785 |
return db
|
2786 |
|
2787 |
|
2788 |
+
def get_source_files_given_langchain_mode(db1s, langchain_mode='UserData', dbs=None):
|
2789 |
+
db = get_db(db1s, langchain_mode, dbs=dbs)
|
2790 |
if langchain_mode in ['ChatLLM', 'LLM'] or db is None:
|
2791 |
return "Sources: N/A"
|
2792 |
return get_source_files(db=db, exceptions=None)
|
|
|
2885 |
return source_files_added
|
2886 |
|
2887 |
|
2888 |
+
def update_and_get_source_files_given_langchain_mode(db1s, langchain_mode, chunk, chunk_size,
|
2889 |
+
dbs=None, first_para=None,
|
2890 |
+
text_limit=None,
|
2891 |
+
langchain_mode_paths=None, db_type=None, load_db_if_exists=None,
|
2892 |
n_jobs=None, verbose=None):
|
2893 |
+
has_path = {k: v for k, v in langchain_mode_paths.items() if v}
|
2894 |
+
if langchain_mode in [LangChainMode.LLM.value, LangChainMode.MY_DATA.value]:
|
2895 |
+
# then assume user really meant UserData, to avoid extra clicks in UI,
|
2896 |
+
# since others can't be on disk, except custom user modes, which they should then select to query it
|
2897 |
+
if LangChainMode.USER_DATA.value in has_path:
|
2898 |
+
langchain_mode = LangChainMode.USER_DATA.value
|
2899 |
+
|
2900 |
+
db = get_db(db1s, langchain_mode, dbs=dbs)
|
2901 |
|
2902 |
from gpt_langchain import make_db
|
2903 |
db, num_new_sources, new_sources_metadata = make_db(use_openai_embedding=False,
|
|
|
2906 |
chunk=chunk,
|
2907 |
chunk_size=chunk_size,
|
2908 |
langchain_mode=langchain_mode,
|
2909 |
+
langchain_mode_paths=langchain_mode_paths,
|
2910 |
db_type=db_type,
|
2911 |
load_db_if_exists=load_db_if_exists,
|
2912 |
db=db,
|
2913 |
n_jobs=n_jobs,
|
2914 |
verbose=verbose)
|
2915 |
+
# during refreshing, might have "created" new db since not in dbs[] yet, so insert back just in case
|
2916 |
+
# so even if persisted, not kept up-to-date with dbs memory
|
2917 |
+
if langchain_mode in db1s:
|
2918 |
+
db1s[langchain_mode][0] = db
|
2919 |
+
else:
|
2920 |
+
dbs[langchain_mode] = db
|
2921 |
+
|
2922 |
# return only new sources with text saying such
|
2923 |
return get_source_files(db=None, exceptions=None, metadatas=new_sources_metadata)
|
2924 |
+
|
2925 |
+
|
2926 |
+
def get_db1(db1s, langchain_mode1):
|
2927 |
+
if langchain_mode1 in db1s:
|
2928 |
+
db1 = db1s[langchain_mode1]
|
2929 |
+
else:
|
2930 |
+
# indicates to code that not scratch database
|
2931 |
+
db1 = [None, None]
|
2932 |
+
return db1
|
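The per-user scratch state that get_db1() and get_db() juggle is just a dict mapping a collection name to a two-element list [db, user_hash], while shared collections live in the server-wide dbs dict. A minimal sketch of that lookup order (the string values below are stand-ins for real vector databases, purely for illustration):

# Minimal sketch of the db1s / dbs lookup order used by get_db() above; the "db" values
# here are plain strings standing in for real Chroma/FAISS databases (assumption).
def pick_db(db1s, langchain_mode, dbs):
    db1 = db1s.get(langchain_mode, [None, None])  # scratch, per-user: [db, user_hash]
    if db1[0] is not None:
        return db1[0]
    return dbs.get(langchain_mode)  # fall back to shared, server-wide collections

db1s = {'MyData': ['my_scratch_db', 'user-1234']}
dbs = {'UserData': 'shared_user_data_db'}
print(pick_db(db1s, 'MyData', dbs))    # my_scratch_db
print(pick_db(db1s, 'UserData', dbs))  # shared_user_data_db
print(pick_db(db1s, 'LLM', dbs))       # None -> no collection, plain LLM chat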
gradio_utils/__init__.py
ADDED
File without changes
|
gradio_utils/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (134 Bytes)
|
|
gradio_utils/__pycache__/css.cpython-310.pyc
CHANGED
Binary files a/gradio_utils/__pycache__/css.cpython-310.pyc and b/gradio_utils/__pycache__/css.cpython-310.pyc differ
|
|
gradio_utils/css.py
CHANGED
@@ -53,4 +53,8 @@ def make_css_base() -> str:
       margin-bottom: 2.5rem;
     }
     .chatsmall chatbot {font-size: 10px !important}
+
+    .gradio-container {
+        max-width: none !important;
+    }
     """
h2oai_pipeline.py
CHANGED
@@ -11,6 +11,7 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
     def __init__(self, *args, debug=False, chat=False, stream_output=False,
                  sanitize_bot_response=False,
                  use_prompter=True, prompter=None,
+                 context='', iinput='',
                  prompt_type=None, prompt_dict=None,
                  max_input_tokens=2048 - 256, **kwargs):
         """
@@ -34,6 +35,8 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
         self.prompt_type = prompt_type
         self.prompt_dict = prompt_dict
         self.prompter = prompter
+        self.context = context
+        self.iinput = iinput
         if self.use_prompter:
             if self.prompter is not None:
                 assert self.prompter.prompt_type is not None
@@ -113,7 +116,7 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
     def preprocess(self, prompt_text, prefix="", handle_long_generation=None, **generate_kwargs):
         prompt_text, num_prompt_tokens = H2OTextGenerationPipeline.limit_prompt(prompt_text, self.tokenizer)

-        data_point = dict(context=
+        data_point = dict(context=self.context, instruction=prompt_text, input=self.iinput)
         if self.prompter is not None:
             prompt_text = self.prompter.generate_prompt(data_point)
         self.prompt_text = prompt_text
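The effect of the new context/iinput arguments is that every request through the pipeline is expanded into the three-field data point the prompter already consumes, instead of an instruction-only one. A standalone sketch of that expansion; the template string below is a placeholder, the real layout comes from the configured prompt_type:

# Sketch of how preprocess() above now feeds the prompter; the template is illustrative,
# not the actual prompt_type template used by Prompter.generate_prompt().
def build_prompt(prompt_text, context='', iinput=''):
    data_point = dict(context=context, instruction=prompt_text, input=iinput)
    template = "{context}\n<human>: {instruction}\n{input}\n<bot>:"
    return template.format(**data_point).replace('\n\n', '\n')

print(build_prompt("Summarize the document.", context="You are concise.", iinput=""))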
iterators/__pycache__/timeout_iterator.cpython-310.pyc
CHANGED
Binary files a/iterators/__pycache__/timeout_iterator.cpython-310.pyc and b/iterators/__pycache__/timeout_iterator.cpython-310.pyc differ
|
|
iterators/timeout_iterator.py
CHANGED
@@ -48,7 +48,7 @@ class TimeoutIterator:
     def interrupt(self):
         """
         interrupt and stop the underlying thread.
-        the thread
+        the thread actually dies only after interrupt has been set and
         the underlying iterator yields a value after that.
         """
         self._interrupt = True
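For reference, gradio_runner.py wraps each streaming generator in this iterator so one slow model cannot stall the multi-model loop: on timeout the iterator yields the sentinel instead of blocking. A minimal usage sketch, assuming only the constructor arguments seen at that call site (timeout, sentinel, raise_on_exception) and this repo's module path:

# Minimal usage sketch of TimeoutIterator as used in all_bot() in gradio_runner.py.
# The import path is assumed from this repo's layout.
import time
from iterators.timeout_iterator import TimeoutIterator

def slow_tokens():
    for tok in ["Hello", " world", "!"]:
        time.sleep(0.5)
        yield tok

it = TimeoutIterator(slow_tokens(), timeout=0.1, sentinel=None, raise_on_exception=False)
for item in it:
    if item is None:
        # sentinel: no token arrived within 0.1s, caller can refresh other outputs and retry
        continue
    print(item)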
prompter.py
CHANGED
@@ -77,6 +77,12 @@ prompt_type_to_model_name = {
    "mptchat": ['mosaicml/mpt-7b-chat', 'mosaicml/mpt-30b-chat', 'TheBloke/mpt-30B-chat-GGML'],
    "vicuna11": ['lmsys/vicuna-33b-v1.3'],
    "falcon": ['tiiuae/falcon-40b-instruct', 'tiiuae/falcon-40b', 'tiiuae/falcon-7b-instruct', 'tiiuae/falcon-7b'],
+   "llama2": [
+       'meta-llama/Llama-2-7b-chat-hf',
+       'meta-llama/Llama-2-13b-chat-hf',
+       'meta-llama/Llama-2-34b-chat-hf',
+       'meta-llama/Llama-2-70b-chat-hf',
+   ],
    # could be plain, but default is correct prompt_type for default TheBloke model ggml-wizardLM-7B.q4_2.bin
}
if os.getenv('OPENAI_API_KEY'):
@@ -582,6 +588,42 @@ ASSISTANT:
        # if add space here, non-unique tokenization will often make LLM produce wrong output
        PreResponse = PreResponse
        # generates_leading_space = True
+   elif prompt_type in [PromptType.guanaco.value, str(PromptType.guanaco.value),
+                        PromptType.guanaco.name]:
+       # https://huggingface.co/TheBloke/guanaco-65B-GPTQ
+       promptA = promptB = "" if not (chat and reduced) else ''
+
+       PreInstruct = """### Human: """
+
+       PreInput = None
+
+       PreResponse = """### Assistant:"""
+       terminate_response = ['### Human:']  # but only allow terminate after prompt is found correctly, else can't terminate
+       chat_turn_sep = chat_sep = '\n'
+       humanstr = PreInstruct
+       botstr = PreResponse
+   elif prompt_type in [PromptType.llama2.value, str(PromptType.llama2.value),
+                        PromptType.llama2.name]:
+       PreInstruct = ""
+       llama2_sys = "<<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n"
+       prompt = "<s>[INST] "
+       enable_sys = False  # too much safety, hurts accuracy
+       if not (chat and reduced):
+           if enable_sys:
+               promptA = promptB = prompt + llama2_sys
+           else:
+               promptA = promptB = prompt
+       else:
+           promptA = promptB = ''
+       PreInput = None
+       PreResponse = ""
+       terminate_response = ["[INST]", "</s>"]
+       chat_sep = ' [/INST]'
+       chat_turn_sep = ' </s><s>[INST] '
+       humanstr = PreInstruct
+       botstr = PreResponse
+       if making_context:
+           PreResponse += " "
    else:
        raise RuntimeError("No such prompt_type=%s" % prompt_type)

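As a hedged illustration only (not output captured from the repo), the llama2 branch above implies a turn layout along these lines; the example messages and the assembled strings are assumed from the constants defined in that branch:

# Hypothetical rendering of llama2-style turns from the constants above
# (promptA = "<s>[INST] ", chat_sep = " [/INST]", chat_turn_sep = " </s><s>[INST] ", enable_sys=False)
instruction = 'What is 2+2?'                                        # assumed user message
turn = "<s>[INST] " + instruction + " [/INST]"
follow_up = turn + " 4 </s><s>[INST] " + 'And 3+3?' + " [/INST]"    # next turn joined by chat_turn_sep
print(turn)
print(follow_up)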
@@ -810,9 +852,20 @@ class Prompter(object):
            if oi > 0:
                # post fix outputs with seperator
                output += '\n'
+           output = self.fix_text(self.prompt_type, output)
            outputs[oi] = output
        # join all outputs, only one extra new line between outputs
        output = '\n'.join(outputs)
        if self.debug:
            print("outputclean:\n%s" % '\n\n'.join(outputs), flush=True)
        return output
+
+   @staticmethod
+   def fix_text(prompt_type1, text1):
+       if prompt_type1 == 'human_bot':
+           # hack bug in vLLM with stopping, stops right, but doesn't return last token
+           hfix = '<human'
+           if text1.endswith(hfix):
+               text1 = text1[:-len(hfix)]
+       return text1
+
requirements.txt
CHANGED
@@ -1,8 +1,8 @@
 # for generate (gradio server) and finetune
 datasets==2.13.0
 sentencepiece==0.1.99
-gradio==3.
-huggingface_hub==0.
+gradio==3.37.0
+huggingface_hub==0.16.4
 appdirs==1.4.4
 fire==0.5.0
 docutils==0.20.1
@@ -19,7 +19,7 @@ matplotlib==3.7.1
 loralib==0.1.1
 bitsandbytes==0.39.0
 accelerate==0.20.3
-
+peft==0.4.0
 transformers==4.30.2
 tokenizers==0.13.3
 APScheduler==3.10.1
@@ -35,7 +35,7 @@ tensorboard==2.13.0
 neptune==1.2.0

 # for gradio client
-gradio_client==0.2.
+gradio_client==0.2.10
 beautifulsoup4==4.12.2
 markdown==3.4.3

@@ -64,8 +64,8 @@ tiktoken==0.4.0
 # optional: for OpenAI endpoint or embeddings (requires key)
 openai==0.27.8
 # optional for chat with PDF
-langchain==0.0.
-pypdf==3.
+langchain==0.0.235
+pypdf==3.12.2
 # avoid textract, requires old six
 #textract==1.6.5

@@ -78,10 +78,10 @@ chromadb==0.3.25
 #pymilvus==2.2.8

 # weak url support, if can't install opencv etc. If comment-in this one, then comment-out unstructured[local-inference]==0.6.6
-# unstructured==0.
+# unstructured==0.8.1

 # strong support for images
-# Requires on Ubuntu: sudo apt-get install libmagic-dev poppler-utils tesseract-ocr libreoffice
+# Requires on Ubuntu: sudo apt-get install libmagic-dev poppler-utils tesseract-ocr libtesseract-dev libreoffice
 unstructured[local-inference]==0.7.4
 #pdf2image==1.16.3
 #pytesseract==0.3.10
@@ -104,10 +104,10 @@ tabulate==0.9.0
 pip-licenses==4.3.0

 # weaviate vector db
-weaviate-client==3.
+weaviate-client==3.22.1
 # optional for chat with PDF
-langchain==0.0.
-pypdf==3.
+langchain==0.0.235
+pypdf==3.12.2
 # avoid textract, requires old six
 #textract==1.6.5

@@ -120,10 +120,10 @@ chromadb==0.3.25
 #pymilvus==2.2.8

 # weak url support, if can't install opencv etc. If comment-in this one, then comment-out unstructured[local-inference]==0.6.6
-# unstructured==0.
+# unstructured==0.8.1

 # strong support for images
-# Requires on Ubuntu: sudo apt-get install libmagic-dev poppler-utils tesseract-ocr libreoffice
+# Requires on Ubuntu: sudo apt-get install libmagic-dev poppler-utils tesseract-ocr libtesseract-dev libreoffice
 unstructured[local-inference]==0.7.4
 #pdf2image==1.16.3
 #pytesseract==0.3.10
@@ -146,8 +146,8 @@ tabulate==0.9.0
 pip-licenses==4.3.0

 # weaviate vector db
-weaviate-client==3.
+weaviate-client==3.22.1
 faiss-gpu==1.7.2
-arxiv==1.4.
-pymupdf==1.22.
+arxiv==1.4.8
+pymupdf==1.22.5  # AGPL license
 # extract-msg==0.41.1 # GPL3
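As a hedged sanity check only (not part of the commit), the bumped pins can be confirmed after installation; the expected strings below simply restate the versions pinned above:

# Hypothetical post-install check of the updated pins
import gradio, gradio_client, langchain
assert gradio.__version__ == "3.37.0"
assert gradio_client.__version__ == "0.2.10"
assert langchain.__version__ == "0.0.235"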
utils.py
CHANGED
@@ -5,6 +5,7 @@ import inspect
 import os
 import gc
 import pathlib
+import pickle
 import random
 import shutil
 import subprocess
@@ -111,12 +112,15 @@ def system_info():
    system = {}
    # https://stackoverflow.com/questions/48951136/plot-multiple-graphs-in-one-plot-using-tensorboard
    # https://arshren.medium.com/monitoring-your-devices-in-python-5191d672f749
-
-
-   coretemp
-
-
-
+   try:
+       temps = psutil.sensors_temperatures(fahrenheit=False)
+       if 'coretemp' in temps:
+           coretemp = temps['coretemp']
+           temp_dict = {k.label: k.current for k in coretemp}
+           for k, v in temp_dict.items():
+               system['CPU_C/%s' % k] = v
+   except AttributeError:
+       pass

    # https://github.com/gpuopenanalytics/pynvml/blob/master/help_query_gpu.txt
    try:
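For orientation, a hedged sketch of the psutil structure the new system_info() block consumes; the label names and temperatures are made-up examples:

# Hypothetical sketch of psutil.sensors_temperatures() as used above
import psutil
try:
    temps = psutil.sensors_temperatures(fahrenheit=False)
    # e.g. {'coretemp': [shwtemp(label='Core 0', current=41.0, high=82.0, critical=92.0), ...]}
    print({t.label: t.current for t in temps.get('coretemp', [])})
except AttributeError:
    # platforms without sensor support lack this psutil API, hence the new except AttributeError guard
    pass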
@@ -779,6 +783,9 @@ def _traced_func(func, *args, **kwargs):


 def call_subprocess_onetask(func, args=None, kwargs=None):
+   import platform
+   if platform.system() in ['Darwin', 'Windows']:
+       return func(*args, **kwargs)
    if isinstance(args, list):
        args = tuple(args)
    if args is None:
@@ -950,7 +957,6 @@ try:
 except (pkg_resources.DistributionNotFound, AssertionError):
    have_langchain = False

-
 import distutils.spawn

 have_tesseract = distutils.spawn.find_executable("tesseract")
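A hedged sketch of the behavioural change to call_subprocess_onetask; the example callable and arguments are assumed, and the pre-existing subprocess path on Linux is untouched by this commit:

# Hypothetical sketch: on macOS/Windows the helper now runs the function in-process
# instead of spawning a one-task subprocess.
result = call_subprocess_onetask(sorted, args=([3, 1, 2],), kwargs=dict(reverse=True))
# on Darwin/Windows this returns [3, 2, 1] directly via func(*args, **kwargs)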
@@ -985,3 +991,90 @@ except (pkg_resources.DistributionNotFound, AssertionError):

 # disable, hangs too often
 have_playwright = False
+
+
+def set_openai(inference_server):
+    if inference_server.startswith('vllm'):
+        import openai_vllm
+        openai_vllm.api_key = "EMPTY"
+        inf_type = inference_server.split(':')[0]
+        ip_vllm = inference_server.split(':')[1]
+        port_vllm = inference_server.split(':')[2]
+        openai_vllm.api_base = f"http://{ip_vllm}:{port_vllm}/v1"
+        return openai_vllm, inf_type
+    else:
+        import openai
+        openai.api_key = os.getenv("OPENAI_API_KEY")
+        openai.api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
+        inf_type = inference_server
+        return openai, inf_type
+
+
+visible_langchain_modes_file = 'visible_langchain_modes.pkl'
+
+
+def save_collection_names(langchain_modes, visible_langchain_modes, langchain_mode_paths, LangChainMode, db1s):
+    """
+    extra controls if UserData type of MyData type
+    """
+    # use first default MyData hash as general user hash to maintain file
+    # if user moves MyData from langchain modes, db will still survive, so can still use hash
+    scratch_collection_names = list(db1s.keys())
+    user_hash = db1s.get(LangChainMode.MY_DATA.value, '')[1]
+
+    llms = ['ChatLLM', 'LLM', 'Disabled']
+
+    scratch_langchain_modes = [x for x in langchain_modes if x in scratch_collection_names]
+    scratch_visible_langchain_modes = [x for x in visible_langchain_modes if x in scratch_collection_names]
+    scratch_langchain_mode_paths = {k: v for k, v in langchain_mode_paths.items() if
+                                    k in scratch_collection_names and k not in llms}
+
+    user_langchain_modes = [x for x in langchain_modes if x not in scratch_collection_names]
+    user_visible_langchain_modes = [x for x in visible_langchain_modes if x not in scratch_collection_names]
+    user_langchain_mode_paths = {k: v for k, v in langchain_mode_paths.items() if
+                                 k not in scratch_collection_names and k not in llms}
+
+    base_path = 'locks'
+    makedirs(base_path)
+
+    # user
+    extra = ''
+    file = "%s%s" % (visible_langchain_modes_file, extra)
+    with filelock.FileLock(os.path.join(base_path, "%s.lock" % file)):
+        with open(file, 'wb') as f:
+            pickle.dump((user_langchain_modes, user_visible_langchain_modes, user_langchain_mode_paths), f)
+
+    # scratch
+    extra = user_hash
+    file = "%s%s" % (visible_langchain_modes_file, extra)
+    with filelock.FileLock(os.path.join(base_path, "%s.lock" % file)):
+        with open(file, 'wb') as f:
+            pickle.dump((scratch_langchain_modes, scratch_visible_langchain_modes, scratch_langchain_mode_paths), f)
+
+
+def load_collection_enum(extra):
+    """
+    extra controls if UserData type of MyData type
+    """
+    file = "%s%s" % (visible_langchain_modes_file, extra)
+    langchain_modes_from_file = []
+    visible_langchain_modes_from_file = []
+    langchain_mode_paths_from_file = {}
+    if os.path.isfile(visible_langchain_modes_file):
+        try:
+            with filelock.FileLock("%s.lock" % file):
+                with open(file, 'rb') as f:
+                    langchain_modes_from_file, visible_langchain_modes_from_file, langchain_mode_paths_from_file = pickle.load(f)
+        except BaseException as e:
+            print("Cannot load %s, ignoring error: %s" % (file, str(e)), flush=True)
+    for k, v in langchain_mode_paths_from_file.items():
+        if v is not None and not os.path.isdir(v) and isinstance(v, str):
+            # assume was deleted, but need to make again to avoid extra code elsewhere
+            makedirs(v)
+    return langchain_modes_from_file, visible_langchain_modes_from_file, langchain_mode_paths_from_file
+
+
+def remove_collection_enum():
+    remove(visible_langchain_modes_file)