pseudotensor committed
Commit 30e5d19
Parent: 9bcca78

Update with h2oGPT hash 03227623260f552fd7e2b8c51409308bc7242933

Files changed (13)
  1. client_test.py +42 -19
  2. create_data.py +60 -69
  3. finetune.py +7 -11
  4. generate.py +236 -242
  5. gpt4all_llm.py +162 -26
  6. gpt_langchain.py +561 -183
  7. gradio_runner.py +252 -110
  8. gradio_themes.py +41 -2
  9. h2oai_pipeline.py +96 -22
  10. prompter.py +119 -22
  11. requirements.txt +12 -11
  12. stopping.py +6 -4
  13. utils.py +83 -8
client_test.py CHANGED
@@ -23,7 +23,7 @@ HOST="https://h2oai-h2ogpt-chatbot.hf.space" python client_test.py
23
  Result:
24
 
25
  Loaded as API: https://h2oai-h2ogpt-chatbot.hf.space ✔
26
- {'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': 'I am h2oGPT, a large language model developed by LAION.'}
27
 
28
 
29
  For demo:
@@ -33,9 +33,15 @@ HOST="https://gpt.h2o.ai" python client_test.py
33
  Result:
34
 
35
  Loaded as API: https://gpt.h2o.ai ✔
36
- {'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': 'I am h2oGPT, a chatbot created by LAION.'}
37
 
38
  """
 
39
  import time
40
  import os
41
  import markdown # pip install markdown
@@ -56,7 +62,7 @@ def get_client(serialize=True):
56
  return client
57
 
58
 
59
- def get_args(prompt, prompt_type, chat=False, stream_output=False, max_new_tokens=50):
60
  from collections import OrderedDict
61
  kwargs = OrderedDict(instruction=prompt if chat else '', # only for chat=True
62
  iinput='', # only for chat=True
@@ -79,12 +85,13 @@ def get_args(prompt, prompt_type, chat=False, stream_output=False, max_new_token
79
  chat=chat,
80
  instruction_nochat=prompt if not chat else '',
81
  iinput_nochat='', # only for chat=False
82
- langchain_mode='Disabled',
 
83
  document_choice=['All'],
84
  )
85
  if chat:
86
  # add chatbot output on end. Assumes serialize=False
87
- kwargs.update(dict(chatbot=[['', None]]))
88
 
89
  return kwargs, list(kwargs.values())
90
 
@@ -103,22 +110,29 @@ def run_client_nochat(prompt, prompt_type, max_new_tokens):
103
  *tuple(args),
104
  api_name=api_name,
105
  )
 
106
  res_dict = dict(prompt=kwargs['instruction_nochat'], iinput=kwargs['iinput_nochat'],
107
- response=md_to_text(res))
 
108
  print(res_dict)
109
  return res_dict
110
 
111
 
112
  @pytest.mark.skip(reason="For manual use against some server, no server launched")
113
  def test_client_chat():
114
- return run_client_chat(prompt='Who are you?', prompt_type='human_bot', stream_output=False, max_new_tokens=50)
115
-
116
 
117
- def run_client_chat(prompt, prompt_type, stream_output, max_new_tokens):
118
- kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output, max_new_tokens=max_new_tokens)
119
 
 
120
  client = get_client(serialize=False)
121
122
  res = client.predict(*tuple(args), api_name='/instruction')
123
  args[-1] += [res[-1]]
124
 
@@ -127,8 +141,8 @@ def run_client_chat(prompt, prompt_type, stream_output, max_new_tokens):
127
  if not kwargs['stream_output']:
128
  res = client.predict(*tuple(args), api_name='/instruction_bot')
129
  res_dict['response'] = res[0][-1][1]
130
- print(md_to_text(res_dict['response']))
131
- return res_dict
132
  else:
133
  job = client.submit(*tuple(args), api_name='/instruction_bot')
134
  res1 = ''
@@ -137,15 +151,24 @@ def run_client_chat(prompt, prompt_type, stream_output, max_new_tokens):
137
  if outputs_list:
138
  res = job.communicator.job.outputs[-1]
139
  res1 = res[0][-1][-1]
140
- res1 = md_to_text(res1)
141
  print(res1)
142
  time.sleep(0.1)
143
- print(job.outputs())
144
- res_dict['response'] = res1
145
- return res_dict
146
-
147
-
148
- def md_to_text(md):
149
  assert md is not None, "Markdown is None"
150
  html = markdown.markdown(md)
151
  soup = BeautifulSoup(html, features='html.parser')
 
23
  Result:
24
 
25
  Loaded as API: https://h2oai-h2ogpt-chatbot.hf.space ✔
26
+ {'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': 'I am h2oGPT, a large language model developed by LAION.', 'sources': ''}
27
 
28
 
29
  For demo:
 
33
  Result:
34
 
35
  Loaded as API: https://gpt.h2o.ai ✔
36
+ {'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': 'I am h2oGPT, a chatbot created by LAION.', 'sources': ''}
37
+
38
+ NOTE: Raw output from API for nochat case is a string of a python dict and will remain so if other entries are added to dict:
39
+
40
+ {'response': "I'm h2oGPT, a large language model by H2O.ai, the visionary leader in democratizing AI.", 'sources': ''}
41
+
42
 
43
  """
44
+ import ast
45
  import time
46
  import os
47
  import markdown # pip install markdown
 
62
  return client
63
 
64
 
65
+ def get_args(prompt, prompt_type, chat=False, stream_output=False, max_new_tokens=50, langchain_mode='Disabled'):
66
  from collections import OrderedDict
67
  kwargs = OrderedDict(instruction=prompt if chat else '', # only for chat=True
68
  iinput='', # only for chat=True
 
85
  chat=chat,
86
  instruction_nochat=prompt if not chat else '',
87
  iinput_nochat='', # only for chat=False
88
+ langchain_mode=langchain_mode,
89
+ top_k_docs=4,
90
  document_choice=['All'],
91
  )
92
  if chat:
93
  # add chatbot output on end. Assumes serialize=False
94
+ kwargs.update(dict(chatbot=[]))
95
 
96
  return kwargs, list(kwargs.values())
97
 
 
110
  *tuple(args),
111
  api_name=api_name,
112
  )
113
+ print("Raw client result: %s" % res, flush=True)
114
  res_dict = dict(prompt=kwargs['instruction_nochat'], iinput=kwargs['iinput_nochat'],
115
+ response=md_to_text(ast.literal_eval(res)['response']),
116
+ sources=ast.literal_eval(res)['sources'])
117
  print(res_dict)
118
  return res_dict
119
 
120
 
121
  @pytest.mark.skip(reason="For manual use against some server, no server launched")
122
  def test_client_chat():
123
+ return run_client_chat(prompt='Who are you?', prompt_type='human_bot', stream_output=False, max_new_tokens=50,
124
+ langchain_mode='Disabled')
125
 
 
 
126
 
127
+ def run_client_chat(prompt, prompt_type, stream_output, max_new_tokens, langchain_mode):
128
  client = get_client(serialize=False)
129
 
130
+ kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
131
+ max_new_tokens=max_new_tokens, langchain_mode=langchain_mode)
132
+ return run_client(client, prompt, args, kwargs)
133
+
134
+
135
+ def run_client(client, prompt, args, kwargs, do_md_to_text=True, verbose=False):
136
  res = client.predict(*tuple(args), api_name='/instruction')
137
  args[-1] += [res[-1]]
138
 
 
141
  if not kwargs['stream_output']:
142
  res = client.predict(*tuple(args), api_name='/instruction_bot')
143
  res_dict['response'] = res[0][-1][1]
144
+ print(md_to_text(res_dict['response'], do_md_to_text=do_md_to_text))
145
+ return res_dict, client
146
  else:
147
  job = client.submit(*tuple(args), api_name='/instruction_bot')
148
  res1 = ''
 
151
  if outputs_list:
152
  res = job.communicator.job.outputs[-1]
153
  res1 = res[0][-1][-1]
154
+ res1 = md_to_text(res1, do_md_to_text=do_md_to_text)
155
  print(res1)
156
  time.sleep(0.1)
157
+ full_outputs = job.outputs()
158
+ if verbose:
159
+ print('job.outputs: %s' % str(full_outputs))
160
+ # ensure get ending to avoid race
161
+ # -1 means last response if streaming
162
+ # 0 means get text_output, ignore exception_text
163
+ # 0 means get list within text_output that looks like [[prompt], [answer]]
164
+ # 1 means get bot answer, so will have last bot answer
165
+ res_dict['response'] = md_to_text(full_outputs[-1][0][0][1], do_md_to_text=do_md_to_text)
166
+ return res_dict, client
167
+
168
+
169
+ def md_to_text(md, do_md_to_text=True):
170
+ if not do_md_to_text:
171
+ return md
172
  assert md is not None, "Markdown is None"
173
  html = markdown.markdown(md)
174
  soup = BeautifulSoup(html, features='html.parser')
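
The client_test.py changes above make the nochat API return its result as the string form of a Python dict, now carrying a 'sources' field next to 'response', so callers are expected to literal-eval it before use. A minimal standalone sketch of that parsing step (not part of this commit; the raw string is just the example value from the docstring):

import ast

import markdown  # pip install markdown
from bs4 import BeautifulSoup  # pip install beautifulsoup4


def md_to_text(md):
    # same conversion client_test.py applies: markdown -> HTML -> plain text
    html = markdown.markdown(md)
    return BeautifulSoup(html, features='html.parser').get_text()


# raw string as returned by the nochat endpoint (example from the docstring above)
raw = "{'response': 'I am h2oGPT, a chatbot created by LAION.', 'sources': ''}"
res = ast.literal_eval(raw)          # safe literal parsing, no eval()
print(md_to_text(res['response']))   # plain-text answer
print(res['sources'])                # empty unless a langchain_mode supplies documents
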
create_data.py CHANGED
@@ -23,7 +23,7 @@ import pandas as pd
23
  import numpy as np
24
  from tqdm import tqdm
25
 
26
- from utils import flatten_list
27
 
28
 
29
  def parse_rst_file(filepath):
@@ -184,7 +184,7 @@ def setup_dai_docs(path=None, dst="working_dir_docs", from_hf=False):
184
  return dst
185
 
186
 
187
- def rst_to_outputs(files, min_len=30, max_len=2048//2 - 30):
188
  # account for sequence length (context window) including prompt and input and output
189
 
190
  # os.system('pandoc -f rst -t plain ./expert_settings/nlp_settings.rst')
@@ -274,22 +274,6 @@ def test_scrape_dai_docs_all_pandoc():
274
  f.write(json.dumps(save_thing, indent=2))
275
 
276
 
277
- def remove(path: str):
278
- try:
279
- if path is not None and os.path.exists(path):
280
- if os.path.isdir(path):
281
- shutil_rmtree(path, ignore_errors=True)
282
- else:
283
- with contextlib.suppress(FileNotFoundError):
284
- os.remove(path)
285
- except:
286
- pass
287
-
288
-
289
- def shutil_rmtree(*args, **kwargs):
290
- return shutil.rmtree(*args, **kwargs)
291
-
292
-
293
  def test_config_to_json():
294
  """
295
  Needs to run from Driverless AI source directory.
@@ -310,15 +294,18 @@ def test_config_to_json():
310
  [
311
  {
312
  'prompt_type': 'plain',
313
- 'instruction': f"<human>: What does {k} do?\n<bot>: {k.replace('_', ' ')} config.toml: {comment or title}\n<human>:".replace("\n", ""),
 
314
  },
315
  {
316
  'prompt_type': 'plain',
317
- 'instruction': f"<human>: Explain {k}.\n<bot>: {k.replace('_', ' ')} config.toml: {comment or title}\n<human>:".replace("\n", ""),
 
318
  },
319
  {
320
  'prompt_type': 'plain',
321
- 'instruction': f"<human>: How can I do this: {title}.\n<bot>: Set the {k.replace('_', ' ')} config.toml\n<human>:".replace("\n", ""),
 
322
  } if title and comment else None,
323
  {
324
  'prompt_type': 'human_bot',
@@ -420,7 +407,8 @@ def test_prep_instruct_vicuna():
420
  from datasets import load_dataset
421
  filename = 'ShareGPT_unfiltered_cleaned_split.json'
422
  if not os.path.exists(filename):
423
- os.system('wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % filename)
 
424
  data = load_dataset("json", data_files={"train": filename})["train"]
425
  training_rows = []
426
  for i in range(data.num_rows):
@@ -440,6 +428,7 @@ def test_prep_instruct_vicuna():
440
  with open(filename + ".generate_human_bot.train_plain.json", "wt") as f:
441
  f.write(json.dumps(training_rows, indent=2))
442
 
 
443
  POSTFIX = ".generate_human_bot.train_plain.json"
444
 
445
  # https://bair.berkeley.edu/blog/2023/04/03/koala/
@@ -497,10 +486,10 @@ useful_oig_files = ['unified_rallio_safety_and_prosocial.jsonl.parquet',
497
  'unified_mathqa_flanv2_kojma_cot.jsonl.parquet',
498
  'unified_merged_code_xp3.jsonl.parquet',
499
  'unified_multi_news.jsonl.parquet',
500
- #'unified_multi_sum.jsonl.parquet'
501
  'unified_ni.jsonl.gz.parquet',
502
  'unified_openai_summarize_tldr.jsonl.parquet',
503
- #'unified_oscar_en_sample_dialog.jsonl.parquet', # create text containing these N words, not specific
504
  'unified_plot_screenplay_books_dialog.jsonl.parquet',
505
  'unified_soda_dialog.jsonl.parquet',
506
  'unified_unnatural_instructions.jsonl.parquet',
@@ -546,8 +535,8 @@ def test_merge_shuffle_small_sample_oig_data():
546
 
547
  def test_join_jsons():
548
  files = ['config.json'] * 1 + \
549
- ['dai_docs.train_cleaned.json'] * 2 + \
550
- ['dai_faq.json'] * 3
551
  print(files)
552
  lst = []
553
  [lst.extend(json.load(open(fil, 'rt'))) for fil in files]
@@ -570,11 +559,10 @@ def test_make_rlhf_good_data(filename):
570
  f.write(json.dumps(new_rows, indent=2))
571
 
572
 
573
-
574
  def test_show_prompts():
575
  files = ['config.json'] * 1 + \
576
- ['dai_docs.train_cleaned.json'] * 1 + \
577
- ['dai_faq.json'] * 1
578
  file_points = [json.load(open(fil, 'rt')) for fil in files]
579
  from prompter import generate_prompt
580
  for data_points in file_points:
@@ -600,7 +588,7 @@ def test_get_open_datasets():
600
  'license:openrail++',
601
  'license:openrail',
602
  'license:bigscience-bloom-rail-1.0',
603
- #'license:agpl-3.0',
604
  'license:other',
605
  'license:unknown',
606
  # 'license:mpl-2.0', # ok, but would have to include original copyright, license, source, copies in distribution
@@ -610,13 +598,13 @@ def test_get_open_datasets():
610
  'license:cc-by-3.0',
611
  'license:cc-by-2.0',
612
  'license:cc-by-2.5',
613
- #'license:cc-by-sa-4.0', # would require same license
614
  'license:odbl',
615
  'license:pddl',
616
  'license:ms-pl',
617
  'license:zlib',
618
  ]
619
- # bad license: cc-by-nc-4.0
620
 
621
  from huggingface_hub import list_datasets
622
  datasets = flatten_list([[x for x in list_datasets(filter=y)] for y in open_tags])
@@ -656,12 +644,12 @@ def test_get_open_datasets():
656
  'language:' not in str(x.tags) or
657
  'language:en' in str(x.tags)]
658
  small_open_english_tasked_datasets = [x for x in open_english_tasked_datasets if
659
- 'n<1K' in str(x.tags) or
660
- '1K<n<10K' in str(x.tags) or
661
- '1K0<n<100K' in str(x.tags) or
662
- '100K<n<1M' in str(x.tags) or
663
- 'size_category' not in str(x.tags)
664
- ]
665
  # 'aeslc' : email_body, subject -> summarization?
666
  # load_dataset(open_tasked_datasets[0].id).data['train'].to_pandas()
667
  ids = [x.id for x in small_open_english_tasked_datasets]
@@ -689,7 +677,8 @@ def test_get_open_datasets():
689
  'humarin/chatgpt-paraphrases', # Paraphrase using ChatGPT
690
  'Jeska/vaccinchat', # not useful
691
  'alespalla/chatbot_instruction_prompts', # mixes alpaca
692
- 'allenai/prosocial-dialog', # already exlucded, but wrongly in other datasets that say more permissive license
 
693
  'AlekseyKorshuk/persona-chat', # low quality
694
  'bavard/personachat_truecased', # low quality
695
  'adamlin/daily_dialog', # medium quality conversations
@@ -724,7 +713,8 @@ def test_get_open_datasets():
724
  # some ids clearly speech related
725
  small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if 'speech' not in x.id]
726
  # HF testing
727
- small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if 'hf-internal-testing' not in x.id]
 
728
  small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if
729
  'chinese' not in x.id]
730
 
@@ -738,7 +728,6 @@ def test_get_open_datasets():
738
  # grep "pip install" getdata9.log
739
  # NOTE: Some datasets have default config, but others are there. Don't know how to access them.
740
 
741
-
742
  """
743
  https://huggingface.co/datasets/wikihow/blob/main/wikihow.py
744
  https://github.com/mahnazkoupaee/WikiHow-Dataset
@@ -773,7 +762,7 @@ def test_get_open_datasets():
773
  def do_one(data_id, num_downloads):
774
  from datasets import load_dataset
775
  out_file = "data_%s.parquet" % str(data_id.replace('/', '_'))
776
- if os.path.isfile(out_file) and os.path.getsize(out_file) > 1024**3:
777
  return
778
  try:
779
  print("Loading data_id %s num_downloads: %s" % (data_id, num_downloads), flush=True)
@@ -881,23 +870,21 @@ useful = ['Dahoas/instruct-human-assistant-prompt',
881
  'lmqg/qg_squad', # context QA
882
  'lmqg/qg_squadshifts', # context QA
883
  'lmqg/qg_subjqa', # context QA
884
- 'pszemraj/HC3-textgen-qa', # QA medium, has human responses -- humans tend to provide links instead of trying to answer
 
885
  'pythonist/newdata', # long context, QA, brief A
886
  'ropes', # long background, situation, question, A
887
  'wikitablequestions', # table -> QA
888
  'bigscience/p3', # context QA but short answers
889
  ]
890
 
891
-
892
-
893
  code_useful = ['0n1xus/codexglue',
894
  'openai_humaneval',
895
  'koutch/staqc',
896
  ]
897
 
898
-
899
  maybe_useful = ['AlekseyKorshuk/comedy-scripts',
900
- 'openbookqa', # hard to parse, low reasoning
901
  'qed', # reasonable QA, but low reasoning
902
  'selqa', # candidate answers
903
  'HuggingFaceH4/instruction-pilot-outputs-filtered',
@@ -905,7 +892,6 @@ maybe_useful = ['AlekseyKorshuk/comedy-scripts',
905
  'npc-engine/light-batch-summarize-dialogue', # dialog summarize, kinda low specific quality
906
  ]
907
 
908
-
909
  summary_useful = ['austin/rheum_abstracts',
910
  'CarperAI/openai_summarize_comparisons', # summarize chosen/rejected
911
  'CarperAI/openai_summarize_tldr', # summarize QA
@@ -928,14 +914,12 @@ summary_useful = ['austin/rheum_abstracts',
928
  'stacked-summaries/stacked-xsum-1024',
929
  ]
930
 
931
-
932
  math_useful = [
933
- 'competition_math'
934
- ]
935
-
936
 
937
  skipped = ['c4', # maybe useful, used for flan, but skipped due to size
938
- ]
939
 
940
  """
941
  To get training data from oig:
@@ -958,14 +942,14 @@ def test_assemble_and_detox():
958
  text_list = df[['text']].values.ravel().tolist()
959
  new_text = []
960
  max_len = 2048 # uber cutoff
961
- MAX_LEN = 2048//2 - 30 # max len per question/answer
962
  for text in tqdm(text_list):
963
  human_starts = [m.start() for m in re.finditer('<human>: ', text)]
964
  if len(human_starts) == 1:
965
  human_starts = [0, len(text)] # always go into for loop below
966
  blurb = ''
967
  for i in range(len(human_starts) - 1):
968
- interaction = text[human_starts[i]: human_starts[i+1]][:max_len]
969
  blurb += interaction
970
  if len(blurb) >= MAX_LEN:
971
  blurb = get_sentences(blurb, length=MAX_LEN)[0]
@@ -1002,17 +986,17 @@ def test_basic_cleaning():
1002
  from profanity_check import predict
1003
  df_list = []
1004
  for data in useful_oig_files:
1005
- #for data in useful_oig_files[:5]:
1006
- #for data in ['unified_openai_summarize_tldr.jsonl.parquet']:
1007
  print("Processing %s" % data, flush=True)
1008
  df = pd.read_parquet(data)
1009
  df = df.reset_index(drop=True)
1010
  # NOTE: Not correct if multiple human-bot interactions, but those dialogs even more desired
1011
- #avg_chars = len(df['text'][0])/(df['text'][0].count(human)+df['text'][0].count(bot))
1012
- df['avg_words'] = df['text'].apply(lambda x: x.count(' ') / (x.count(human) + x.count(bot))/2.0)
1013
  df['avg_bot_words'] = df['text'].apply(lambda x: x.split(bot)[1].count(' ') / x.count(bot))
1014
- #df['bad_words'] = df['text'].apply(lambda x: profanity.contains_profanity(x))
1015
- #low_quality_patterns = ['Write the rest of this wikipedia article']
1016
  res = predict(df['text'])
1017
  df['bad_words'] = res
1018
  df = df.reset_index(drop=True)
@@ -1215,7 +1199,7 @@ def count_human_bot_lengths(df, human=None, bot=None):
1215
  assert len(text)
1216
  list_what = []
1217
  for ii in range(len(starts) - 1):
1218
- interaction = text[starts[ii]: starts[ii+1]]
1219
  if other in interaction:
1220
  interaction = interaction[:interaction.find(other)]
1221
  interaction.strip()
@@ -1416,9 +1400,13 @@ def test_add_open_assistant(fixup_personality, only_personality, deberta_grading
1416
  conv2['message_id'] = None
1417
  conversations = [c for c in conversations if c['message_id']]
1418
  if only_personality:
1419
- all_rows.extend([dict(input=c['text'] + "\n<human>:", prompt_type='plain', source=data_file) for c in conversations if 'h2oGPT' in c['text']])
 
 
1420
  else:
1421
- all_rows.extend([dict(input=c['text'] + "\n<human>:", prompt_type='plain', source=data_file) for c in conversations if "What is H2O.ai" not in c['text']])
 
 
1422
  unhelpful = get_unhelpful_list()
1423
  all_rows = [x for x in all_rows if not any(u in x['input'] for u in unhelpful)]
1424
  personality = create_personality_data()
@@ -1484,6 +1472,7 @@ def test_finalize_to_json():
1484
  n_jobs=-1,
1485
  )
1486
  return df[(df['profanity'] == 0)].reset_index(drop=True)
 
1487
  print("Before cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True)
1488
  df = final_clean(df)
1489
  print("After cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True)
@@ -1721,7 +1710,7 @@ def test_check_unhelpful():
1721
  # file = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'
1722
 
1723
  unhelpful = get_unhelpful_list()
1724
- #data = json.load(open(file, 'rt'))
1725
  df = pd.read_json(file)
1726
 
1727
  use_reward_score_threshold = False
@@ -1733,7 +1722,7 @@ def test_check_unhelpful():
1733
  from nltk.translate.bleu_score import sentence_bleu
1734
 
1735
  def get_bleu(actual, expected_list):
1736
- #return bleu.sentence_score(actual, expected_list).score
1737
  return sentence_bleu(expected_list, actual)
1738
 
1739
  threshold = 0.0
@@ -1770,12 +1759,13 @@ def test_check_unhelpful():
1770
  # pip install sentence_transformers-2.2.2
1771
  from sentence_transformers import SentenceTransformer
1772
  # sent_model = 'bert-base-nli-mean-tokens'
1773
- #sent_model = 'nli-distilroberta-base-v2'
1774
  sent_model = 'all-MiniLM-L6-v2'
1775
  model = SentenceTransformer(sent_model)
1776
  sentence_embeddings = model.encode(unhelpful)
1777
  from sklearn.metrics.pairwise import cosine_similarity
1778
- bots = [x for x in tqdm(bots) if np.max(cosine_similarity(model.encode(x), sentence_embeddings)) < cosine_sim_threshold]
 
1779
 
1780
  bads_bots = {}
1781
  string_all = str(bots)
@@ -1787,7 +1777,8 @@ def test_check_unhelpful():
1787
  pp.pprint(bads_bots)
1788
 
1789
  total_bads_bots = sum(list(bads_bots.values()))
1790
- print('threshold: %g use_bleu_threshold: %g total_bads_bots: %s total_bots: %s total_humans: %s' % (threshold, use_bleu_threshold, total_bads_bots, len(bots), len(humans)), flush=True)
 
1791
 
1792
  # assert len(bads) == 0, bads
1793
  assert len(bads_bots) == 0, bads_bots
 
23
  import numpy as np
24
  from tqdm import tqdm
25
 
26
+ from utils import flatten_list, remove
27
 
28
 
29
  def parse_rst_file(filepath):
 
184
  return dst
185
 
186
 
187
+ def rst_to_outputs(files, min_len=30, max_len=2048 // 2 - 30):
188
  # account for sequence length (context window) including prompt and input and output
189
 
190
  # os.system('pandoc -f rst -t plain ./expert_settings/nlp_settings.rst')
 
274
  f.write(json.dumps(save_thing, indent=2))
275
 
276
 
277
  def test_config_to_json():
278
  """
279
  Needs to run from Driverless AI source directory.
 
294
  [
295
  {
296
  'prompt_type': 'plain',
297
+ 'instruction': f"<human>: What does {k} do?\n<bot>: {k.replace('_', ' ')} config.toml: {comment or title}\n<human>:".replace(
298
+ "\n", ""),
299
  },
300
  {
301
  'prompt_type': 'plain',
302
+ 'instruction': f"<human>: Explain {k}.\n<bot>: {k.replace('_', ' ')} config.toml: {comment or title}\n<human>:".replace(
303
+ "\n", ""),
304
  },
305
  {
306
  'prompt_type': 'plain',
307
+ 'instruction': f"<human>: How can I do this: {title}.\n<bot>: Set the {k.replace('_', ' ')} config.toml\n<human>:".replace(
308
+ "\n", ""),
309
  } if title and comment else None,
310
  {
311
  'prompt_type': 'human_bot',
 
407
  from datasets import load_dataset
408
  filename = 'ShareGPT_unfiltered_cleaned_split.json'
409
  if not os.path.exists(filename):
410
+ os.system(
411
+ 'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % filename)
412
  data = load_dataset("json", data_files={"train": filename})["train"]
413
  training_rows = []
414
  for i in range(data.num_rows):
 
428
  with open(filename + ".generate_human_bot.train_plain.json", "wt") as f:
429
  f.write(json.dumps(training_rows, indent=2))
430
 
431
+
432
  POSTFIX = ".generate_human_bot.train_plain.json"
433
 
434
  # https://bair.berkeley.edu/blog/2023/04/03/koala/
 
486
  'unified_mathqa_flanv2_kojma_cot.jsonl.parquet',
487
  'unified_merged_code_xp3.jsonl.parquet',
488
  'unified_multi_news.jsonl.parquet',
489
+ # 'unified_multi_sum.jsonl.parquet'
490
  'unified_ni.jsonl.gz.parquet',
491
  'unified_openai_summarize_tldr.jsonl.parquet',
492
+ # 'unified_oscar_en_sample_dialog.jsonl.parquet', # create text containing these N words, not specific
493
  'unified_plot_screenplay_books_dialog.jsonl.parquet',
494
  'unified_soda_dialog.jsonl.parquet',
495
  'unified_unnatural_instructions.jsonl.parquet',
 
535
 
536
  def test_join_jsons():
537
  files = ['config.json'] * 1 + \
538
+ ['dai_docs.train_cleaned.json'] * 2 + \
539
+ ['dai_faq.json'] * 3
540
  print(files)
541
  lst = []
542
  [lst.extend(json.load(open(fil, 'rt'))) for fil in files]
 
559
  f.write(json.dumps(new_rows, indent=2))
560
 
561
 
 
562
  def test_show_prompts():
563
  files = ['config.json'] * 1 + \
564
+ ['dai_docs.train_cleaned.json'] * 1 + \
565
+ ['dai_faq.json'] * 1
566
  file_points = [json.load(open(fil, 'rt')) for fil in files]
567
  from prompter import generate_prompt
568
  for data_points in file_points:
 
588
  'license:openrail++',
589
  'license:openrail',
590
  'license:bigscience-bloom-rail-1.0',
591
+ # 'license:agpl-3.0',
592
  'license:other',
593
  'license:unknown',
594
  # 'license:mpl-2.0', # ok, but would have to include original copyright, license, source, copies in distribution
 
598
  'license:cc-by-3.0',
599
  'license:cc-by-2.0',
600
  'license:cc-by-2.5',
601
+ # 'license:cc-by-sa-4.0', # would require same license
602
  'license:odbl',
603
  'license:pddl',
604
  'license:ms-pl',
605
  'license:zlib',
606
  ]
607
+ # bad license: cc-by-nc-4.0
608
 
609
  from huggingface_hub import list_datasets
610
  datasets = flatten_list([[x for x in list_datasets(filter=y)] for y in open_tags])
 
644
  'language:' not in str(x.tags) or
645
  'language:en' in str(x.tags)]
646
  small_open_english_tasked_datasets = [x for x in open_english_tasked_datasets if
647
+ 'n<1K' in str(x.tags) or
648
+ '1K<n<10K' in str(x.tags) or
649
+ '1K0<n<100K' in str(x.tags) or
650
+ '100K<n<1M' in str(x.tags) or
651
+ 'size_category' not in str(x.tags)
652
+ ]
653
  # 'aeslc' : email_body, subject -> summarization?
654
  # load_dataset(open_tasked_datasets[0].id).data['train'].to_pandas()
655
  ids = [x.id for x in small_open_english_tasked_datasets]
 
677
  'humarin/chatgpt-paraphrases', # Paraphrase using ChatGPT
678
  'Jeska/vaccinchat', # not useful
679
  'alespalla/chatbot_instruction_prompts', # mixes alpaca
680
+ 'allenai/prosocial-dialog',
681
+ # already excluded, but wrongly in other datasets that say more permissive license
682
  'AlekseyKorshuk/persona-chat', # low quality
683
  'bavard/personachat_truecased', # low quality
684
  'adamlin/daily_dialog', # medium quality conversations
 
713
  # some ids clearly speech related
714
  small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if 'speech' not in x.id]
715
  # HF testing
716
+ small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if
717
+ 'hf-internal-testing' not in x.id]
718
  small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if
719
  'chinese' not in x.id]
720
 
 
728
  # grep "pip install" getdata9.log
729
  # NOTE: Some datasets have default config, but others are there. Don't know how to access them.
730
 
 
731
  """
732
  https://huggingface.co/datasets/wikihow/blob/main/wikihow.py
733
  https://github.com/mahnazkoupaee/WikiHow-Dataset
 
762
  def do_one(data_id, num_downloads):
763
  from datasets import load_dataset
764
  out_file = "data_%s.parquet" % str(data_id.replace('/', '_'))
765
+ if os.path.isfile(out_file) and os.path.getsize(out_file) > 1024 ** 3:
766
  return
767
  try:
768
  print("Loading data_id %s num_downloads: %s" % (data_id, num_downloads), flush=True)
 
870
  'lmqg/qg_squad', # context QA
871
  'lmqg/qg_squadshifts', # context QA
872
  'lmqg/qg_subjqa', # context QA
873
+ 'pszemraj/HC3-textgen-qa',
874
+ # QA medium, has human responses -- humans tend to provide links instead of trying to answer
875
  'pythonist/newdata', # long context, QA, brief A
876
  'ropes', # long background, situation, question, A
877
  'wikitablequestions', # table -> QA
878
  'bigscience/p3', # context QA but short answers
879
  ]
880
 
 
 
881
  code_useful = ['0n1xus/codexglue',
882
  'openai_humaneval',
883
  'koutch/staqc',
884
  ]
885
 
 
886
  maybe_useful = ['AlekseyKorshuk/comedy-scripts',
887
+ 'openbookqa', # hard to parse, low reasoning
888
  'qed', # reasonable QA, but low reasoning
889
  'selqa', # candidate answers
890
  'HuggingFaceH4/instruction-pilot-outputs-filtered',
 
892
  'npc-engine/light-batch-summarize-dialogue', # dialog summarize, kinda low specific quality
893
  ]
894
 
 
895
  summary_useful = ['austin/rheum_abstracts',
896
  'CarperAI/openai_summarize_comparisons', # summarize chosen/rejected
897
  'CarperAI/openai_summarize_tldr', # summarize QA
 
914
  'stacked-summaries/stacked-xsum-1024',
915
  ]
916
 
 
917
  math_useful = [
918
+ 'competition_math'
919
+ ]
 
920
 
921
  skipped = ['c4', # maybe useful, used for flan, but skipped due to size
922
+ ]
923
 
924
  """
925
  To get training data from oig:
 
942
  text_list = df[['text']].values.ravel().tolist()
943
  new_text = []
944
  max_len = 2048 # uber cutoff
945
+ MAX_LEN = 2048 // 2 - 30 # max len per question/answer
946
  for text in tqdm(text_list):
947
  human_starts = [m.start() for m in re.finditer('<human>: ', text)]
948
  if len(human_starts) == 1:
949
  human_starts = [0, len(text)] # always go into for loop below
950
  blurb = ''
951
  for i in range(len(human_starts) - 1):
952
+ interaction = text[human_starts[i]: human_starts[i + 1]][:max_len]
953
  blurb += interaction
954
  if len(blurb) >= MAX_LEN:
955
  blurb = get_sentences(blurb, length=MAX_LEN)[0]
 
986
  from profanity_check import predict
987
  df_list = []
988
  for data in useful_oig_files:
989
+ # for data in useful_oig_files[:5]:
990
+ # for data in ['unified_openai_summarize_tldr.jsonl.parquet']:
991
  print("Processing %s" % data, flush=True)
992
  df = pd.read_parquet(data)
993
  df = df.reset_index(drop=True)
994
  # NOTE: Not correct if multiple human-bot interactions, but those dialogs even more desired
995
+ # avg_chars = len(df['text'][0])/(df['text'][0].count(human)+df['text'][0].count(bot))
996
+ df['avg_words'] = df['text'].apply(lambda x: x.count(' ') / (x.count(human) + x.count(bot)) / 2.0)
997
  df['avg_bot_words'] = df['text'].apply(lambda x: x.split(bot)[1].count(' ') / x.count(bot))
998
+ # df['bad_words'] = df['text'].apply(lambda x: profanity.contains_profanity(x))
999
+ # low_quality_patterns = ['Write the rest of this wikipedia article']
1000
  res = predict(df['text'])
1001
  df['bad_words'] = res
1002
  df = df.reset_index(drop=True)
 
1199
  assert len(text)
1200
  list_what = []
1201
  for ii in range(len(starts) - 1):
1202
+ interaction = text[starts[ii]: starts[ii + 1]]
1203
  if other in interaction:
1204
  interaction = interaction[:interaction.find(other)]
1205
  interaction.strip()
 
1400
  conv2['message_id'] = None
1401
  conversations = [c for c in conversations if c['message_id']]
1402
  if only_personality:
1403
+ all_rows.extend(
1404
+ [dict(input=c['text'] + "\n<human>:", prompt_type='plain', source=data_file) for c in conversations if
1405
+ 'h2oGPT' in c['text']])
1406
  else:
1407
+ all_rows.extend(
1408
+ [dict(input=c['text'] + "\n<human>:", prompt_type='plain', source=data_file) for c in conversations if
1409
+ "What is H2O.ai" not in c['text']])
1410
  unhelpful = get_unhelpful_list()
1411
  all_rows = [x for x in all_rows if not any(u in x['input'] for u in unhelpful)]
1412
  personality = create_personality_data()
 
1472
  n_jobs=-1,
1473
  )
1474
  return df[(df['profanity'] == 0)].reset_index(drop=True)
1475
+
1476
  print("Before cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True)
1477
  df = final_clean(df)
1478
  print("After cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True)
 
1710
  # file = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'
1711
 
1712
  unhelpful = get_unhelpful_list()
1713
+ # data = json.load(open(file, 'rt'))
1714
  df = pd.read_json(file)
1715
 
1716
  use_reward_score_threshold = False
 
1722
  from nltk.translate.bleu_score import sentence_bleu
1723
 
1724
  def get_bleu(actual, expected_list):
1725
+ # return bleu.sentence_score(actual, expected_list).score
1726
  return sentence_bleu(expected_list, actual)
1727
 
1728
  threshold = 0.0
 
1759
  # pip install sentence_transformers-2.2.2
1760
  from sentence_transformers import SentenceTransformer
1761
  # sent_model = 'bert-base-nli-mean-tokens'
1762
+ # sent_model = 'nli-distilroberta-base-v2'
1763
  sent_model = 'all-MiniLM-L6-v2'
1764
  model = SentenceTransformer(sent_model)
1765
  sentence_embeddings = model.encode(unhelpful)
1766
  from sklearn.metrics.pairwise import cosine_similarity
1767
+ bots = [x for x in tqdm(bots) if
1768
+ np.max(cosine_similarity(model.encode(x), sentence_embeddings)) < cosine_sim_threshold]
1769
 
1770
  bads_bots = {}
1771
  string_all = str(bots)
 
1777
  pp.pprint(bads_bots)
1778
 
1779
  total_bads_bots = sum(list(bads_bots.values()))
1780
+ print('threshold: %g use_bleu_threshold: %g total_bads_bots: %s total_bots: %s total_humans: %s' % (
1781
+ threshold, use_bleu_threshold, total_bads_bots, len(bots), len(humans)), flush=True)
1782
 
1783
  # assert len(bads) == 0, bads
1784
  assert len(bads_bots) == 0, bads_bots
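
For orientation, the unhelpful-response filter exercised in test_check_unhelpful above (drop bot replies whose sentence embedding sits too close to any known unhelpful phrase) reduces to roughly the following sketch; the phrase list, example responses, and threshold are placeholders, the real ones come from get_unhelpful_list() and the test settings:

import numpy as np
from sentence_transformers import SentenceTransformer  # pip install sentence_transformers
from sklearn.metrics.pairwise import cosine_similarity

unhelpful = ["I'm sorry, I cannot answer that.", "As an AI language model, I can't help."]  # placeholders
bots = ["The capital of France is Paris.", "I'm sorry, I can't help with that."]  # placeholder bot replies

model = SentenceTransformer('all-MiniLM-L6-v2')
unhelpful_emb = model.encode(unhelpful)  # shape: (num_phrases, dim)
cosine_sim_threshold = 0.7  # placeholder cutoff

# keep only responses that are not too similar to any unhelpful phrase
kept = [x for x in bots
        if np.max(cosine_similarity(model.encode([x]), unhelpful_emb)) < cosine_sim_threshold]
print(kept)
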
finetune.py CHANGED
@@ -65,7 +65,8 @@ def train(
65
  micro_batch_size: int = 4,
66
  gradient_checkpointing=False, # unnecessary with gradient accumulation enabled
67
  fp16=True,
68
- train_8bit=True,
 
69
 
70
  # general training hyperparams
71
  num_epochs: float = 1,
@@ -185,10 +186,12 @@ def train(
185
  model = model_loader.from_pretrained(
186
  base_model,
187
  load_in_8bit=train_8bit,
 
188
  device_map=device_map,
189
  torch_dtype=torch.float16,
190
  max_memory=max_memory,
191
  local_files_only=local_files_only,
 
192
  resume_download=resume_download,
193
  use_auth_token=use_auth_token,
194
  )
@@ -200,19 +203,12 @@ def train(
200
 
201
  tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)
202
 
203
- if train_8bit:
204
  from peft import (
205
- prepare_model_for_int8_training,
206
  )
207
 
208
- if "gpt-neox" not in base_model or True:
209
- model = prepare_model_for_int8_training(model)
210
- else:
211
- model = prepare_model_for_int8_training(
212
- model,
213
- output_embedding_layer_name="embed_out", # keep output logits in float32
214
- layer_norm_names=["layer_norm", "layernorm"], # keep all layer norms in higher precision
215
- )
216
 
217
  from peft import LoraConfig, get_peft_model, set_peft_model_state_dict
218
  try:
 
65
  micro_batch_size: int = 4,
66
  gradient_checkpointing=False, # unnecessary with gradient accumulation enabled
67
  fp16=True,
68
+ train_8bit=False,
69
+ train_4bit=False,
70
 
71
  # general training hyperparams
72
  num_epochs: float = 1,
 
186
  model = model_loader.from_pretrained(
187
  base_model,
188
  load_in_8bit=train_8bit,
189
+ load_in_4bit=train_4bit,
190
  device_map=device_map,
191
  torch_dtype=torch.float16,
192
  max_memory=max_memory,
193
  local_files_only=local_files_only,
194
+ trust_remote_code=True,
195
  resume_download=resume_download,
196
  use_auth_token=use_auth_token,
197
  )
 
203
 
204
  tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)
205
 
206
+ if train_8bit or train_4bit:
207
  from peft import (
208
+ prepare_model_for_kbit_training,
209
  )
210
 
211
+ model = prepare_model_for_kbit_training(model)
 
 
 
 
 
 
 
212
 
213
  from peft import LoraConfig, get_peft_model, set_peft_model_state_dict
214
  try:
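
The finetune.py change above replaces the 8-bit-only prepare_model_for_int8_training path with peft's prepare_model_for_kbit_training and adds a train_4bit flag alongside train_8bit. A rough sketch of that k-bit LoRA setup, assuming peft>=0.4 and bitsandbytes are installed; the model name and LoRA hyperparameters are illustrative only:

import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

base_model = "h2oai/h2ogpt-oasst1-512-12b"  # example; any causal LM works
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_4bit=True,       # or load_in_8bit=True for the train_8bit path
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model = prepare_model_for_kbit_training(model)  # cast norms, enable input grads for k-bit training

lora_config = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05,
    bias="none", task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the LoRA adapters are trainable
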
generate.py CHANGED
@@ -9,24 +9,25 @@ import os
9
  import time
10
  import traceback
11
  import typing
 
12
  from datetime import datetime
13
  import filelock
14
  import psutil
15
 
 
 
 
 
16
  from loaders import get_loaders
17
  from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial, EThread, get_githash, \
18
- import_matplotlib, get_device, makedirs
19
 
20
  import_matplotlib()
21
- from matplotlib import pyplot as plt
22
 
23
  SEED = 1236
24
  set_seed(SEED)
25
 
26
- os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
27
  from typing import Union
28
- import numpy as np
29
- import pandas as pd
30
 
31
  import fire
32
  import torch
@@ -34,7 +35,7 @@ from peft import PeftModel
34
  from transformers import GenerationConfig, AutoModel, TextIteratorStreamer
35
  from accelerate import init_empty_weights, infer_auto_device_map
36
 
37
- from prompter import Prompter, inv_prompt_type_to_model_lower
38
  from stopping import get_stopping
39
 
40
  eval_extra_columns = ['prompt', 'response', 'score']
@@ -47,12 +48,14 @@ scratch_base_dir = '/tmp/'
47
 
48
  def main(
49
  load_8bit: bool = False,
 
50
  load_half: bool = True,
51
  infer_devices: bool = True,
52
  base_model: str = '',
53
  tokenizer_base_model: str = '',
54
  lora_weights: str = "",
55
  gpu_id: int = 0,
 
56
 
57
  prompt_type: Union[int, str] = None,
58
  # input to generation
@@ -68,6 +71,7 @@ def main(
68
  early_stopping: Union[bool, str] = None,
69
  max_time: float = None,
70
 
 
71
  debug: bool = False,
72
  save_dir: str = None,
73
  share: bool = True,
@@ -80,15 +84,18 @@ def main(
80
  src_lang: str = "English",
81
  tgt_lang: str = "Russian",
82
 
 
 
83
  gradio: bool = True,
84
  gradio_avoid_processing_markdown: bool = False,
 
85
  chat: bool = True,
86
  chat_context: bool = False,
87
  stream_output: bool = True,
88
  show_examples: bool = None,
89
  verbose: bool = False,
90
- h2ocolors: bool = True,
91
- height: int = 400,
92
  show_lora: bool = True,
93
  login_mode_if_model0: bool = False,
94
  block_gradio_exit: bool = True,
@@ -107,13 +114,16 @@ def main(
107
  score_model: str = 'OpenAssistant/reward-model-deberta-v3-large-v2',
108
  auto_score: bool = True,
109
 
110
- eval_sharegpt_prompts_only: int = 0,
111
- eval_sharegpt_prompts_only_seed: int = 1234,
112
- eval_sharegpt_as_output: bool = False,
 
113
 
114
  langchain_mode: str = 'Disabled',
115
  visible_langchain_modes: list = ['UserData', 'MyData'],
 
116
  user_path: str = None,
 
117
  load_db_if_exists: bool = True,
118
  keep_sources_in_context: bool = False,
119
  db_type: str = 'chroma',
@@ -127,7 +137,7 @@ def main(
127
  enable_sources_list: bool = True,
128
  chunk: bool = True,
129
  chunk_size: int = 512,
130
- k: int = 4,
131
  n_jobs: int = -1,
132
  enable_captions: bool = True,
133
  captions_model: str = "Salesforce/blip-image-captioning-base",
@@ -138,12 +148,14 @@ def main(
138
  """
139
 
140
  :param load_8bit: load model in 8-bit using bitsandbytes
 
141
  :param load_half: load model in float16
142
  :param infer_devices: whether to control devices with gpu_id. If False, then spread across GPUs
143
- :param base_model: model HF-type name
144
- :param tokenizer_base_model: tokenizer HF-type name
145
  :param lora_weights: LORA weights path/HF link
146
  :param gpu_id: if infer_devices, then use gpu_id for cuda device ID, or auto mode if gpu_id != -1
 
147
  :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
148
  :param temperature: generation temperature
149
  :param top_p: generation top_p
@@ -156,6 +168,7 @@ def main(
156
  :param min_new_tokens: generation min tokens
157
  :param early_stopping: generation early stopping
158
  :param max_time: maximum time to allow for generation
 
159
  :param debug: enable debug mode
160
  :param save_dir: directory chat data is saved to
161
  :param share: whether to share the gradio app with sharable URL
@@ -166,8 +179,16 @@ def main(
166
  :param offload_folder: path for spilling model onto disk
167
  :param src_lang: source languages to include if doing translation (None = all)
168
  :param tgt_lang: target languages to include if doing translation (None = all)
 
 
169
  :param gradio: whether to enable gradio, or to enable benchmark mode
170
  :param gradio_avoid_processing_markdown:
 
 
 
 
 
 
171
  :param chat: whether to enable chat mode with chat history
172
  :param chat_context: whether to use extra helpful context if human_bot
173
  :param stream_output: whether to stream output from generate
@@ -190,32 +211,37 @@ def main(
190
  :param extra_lora_options: extra LORA to show in list in gradio
191
  :param score_model: which model to score responses (None means no scoring)
192
  :param auto_score: whether to automatically score responses
193
- :param eval_sharegpt_prompts_only: for no gradio benchmark, if using ShareGPT prompts for eval
194
- :param eval_sharegpt_prompts_only_seed: for no gradio benchmark, if seed for ShareGPT sampling
195
- :param eval_sharegpt_as_output: for no gradio benchmark, whether to test ShareGPT output itself
 
196
  :param langchain_mode: Data source to include. Choose "UserData" to only consume files from make_db.py.
197
  WARNING: wiki_full requires extra data processing via read_wiki_full.py and requires really good workstation to generate db, unless already present.
198
- :param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode
 
 
 
199
  :param visible_langchain_modes: dbs to generate at launch to be ready for LLM
200
  Can be up to ['wiki', 'wiki_full', 'UserData', 'MyData', 'github h2oGPT', 'DriverlessAI docs']
201
  But wiki_full is expensive and requires preparation
202
  To allow scratch space only live in session, add 'MyData' to list
203
  Default: If only want to consume local files, e.g. prepared by make_db.py, only include ['UserData']
204
  FIXME: Avoid 'All' for now, not implemented
 
205
  :param load_db_if_exists: Whether to load chroma db if exists or re-generate db
206
  :param keep_sources_in_context: Whether to keep url sources in context, not helpful usually
207
- :param db_type: 'faiss' for in-memory or 'chroma' for persisted on disk
208
  :param use_openai_embedding: Whether to use OpenAI embeddings for vector db
209
  :param use_openai_model: Whether to use OpenAI model for use with vector db
210
  :param hf_embedding_model: Which HF embedding model to use for vector db
211
  :param allow_upload_to_user_data: Whether to allow file uploads to update shared vector db
212
  :param allow_upload_to_my_data: Whether to allow file uploads to update scratch vector db
213
  :param enable_url_upload: Whether to allow upload from URL
214
- :param enable_text_upload: Whether to allow uplaod of text
215
  :param enable_sources_list: Whether to allow list (or download for non-shared db) of list of sources for chosen db
216
  :param chunk: Whether to chunk data (True unless know data is already optimally chunked)
217
  :param chunk_size: Size of chunks, with typically top-4 passed to LLM, so needs to be in context length
218
- :param k: number of chunks to give LLM
219
  :param n_jobs: Number of processors to use when consuming documents (-1 = all, is default)
220
  :param enable_captions: Whether to support captions using BLIP for image files as documents, then preloads that model
221
  :param captions_model: Which model to use for captions.
@@ -233,7 +259,10 @@ def main(
233
  is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
234
  is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
235
  is_public = is_hf or is_gpth2oai # multi-user case with fixed model and disclaimer
236
- is_low_mem = is_hf # assumes run on 24GB consumer GPU
 
 
 
237
  admin_pass = os.getenv("ADMIN_PASS")
238
  # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
239
  # but becomes unrecoverable sometimes if raise, so just be silent for now
@@ -265,21 +294,23 @@ def main(
265
  # by default don't sample, too chatty
266
  do_sample = False if do_sample is None else do_sample
267
 
268
- if is_low_mem:
269
  if not base_model:
270
  base_model = 'h2oai/h2ogpt-oasst1-512-12b'
271
  # don't set load_8bit if passed base_model, doesn't always work so can't just override
272
  load_8bit = True
 
273
  else:
274
  base_model = 'h2oai/h2ogpt-oasst1-512-20b' if not base_model else base_model
275
- if is_low_mem:
276
  load_8bit = True
 
277
  if is_hf:
278
  # must override share if in spaces
279
  share = False
280
  save_dir = os.getenv('SAVE_DIR', save_dir)
281
  score_model = os.getenv('SCORE_MODEL', score_model)
282
- if score_model == 'None':
283
  score_model = ''
284
  concurrency_count = int(os.getenv('CONCURRENCY_COUNT', concurrency_count))
285
  api_open = bool(int(os.getenv('API_OPEN', api_open)))
@@ -289,6 +320,7 @@ def main(
289
  if n_gpus == 0:
290
  gpu_id = None
291
  load_8bit = False
 
292
  load_half = False
293
  infer_devices = False
294
  torch.backends.cudnn.benchmark = True
@@ -328,12 +360,15 @@ def main(
328
  max_new_tokens, min_new_tokens, early_stopping, max_time,
329
  repetition_penalty, num_return_sequences,
330
  do_sample,
 
 
331
  )
332
 
333
  locals_dict = locals()
334
  locals_print = '\n'.join(['%s: %s' % (k, v) for k, v in locals_dict.items()])
335
- print(f"Generating model with params:\n{locals_print}", flush=True)
336
- print("Command: %s\nHash: %s" % (str(' '.join(sys.argv)), get_githash()), flush=True)
 
337
 
338
  if langchain_mode != "Disabled":
339
  # SECOND PLACE where LangChain referenced, but all imports are kept local so not required
@@ -353,7 +388,9 @@ def main(
353
  # FIXME: All should be avoided until scans over each db, shouldn't be separate db
354
  continue
355
  persist_directory1 = 'db_dir_%s' % langchain_mode1 # single place, no special names for each case
356
- db = prep_langchain(persist_directory1, load_db_if_exists, db_type, use_openai_embedding,
 
 
357
  langchain_mode1, user_path,
358
  hf_embedding_model,
359
  kwargs_make_db=locals())
@@ -367,174 +404,30 @@ def main(
367
  assert 'gpt_langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
368
  assert 'langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
369
 
370
- if not gradio:
371
- if eval_sharegpt_prompts_only > 0:
372
- # override default examples with shareGPT ones for human-level eval purposes only
373
- eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
374
- if not os.path.isfile(eval_filename):
375
- os.system(
376
- 'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
377
- import json
378
- data = json.load(open(eval_filename, 'rt'))
379
- # focus on data that starts with human, else likely chopped from other data
380
- turn_start = 0 # odd in general
381
- data = [x for x in data if len(x['conversations']) > turn_start + 1 and
382
- x['conversations'][turn_start]['from'] == 'human' and
383
- x['conversations'][turn_start + 1]['from'] == 'gpt']
384
- np.random.seed(eval_sharegpt_prompts_only_seed)
385
- example1 = examples[-1] # pick reference example
386
- examples = []
387
- responses = []
388
- for i in list(np.random.randint(0, len(data), size=eval_sharegpt_prompts_only)):
389
- assert data[i]['conversations'][turn_start]['from'] == 'human'
390
- instruction = data[i]['conversations'][turn_start]['value']
391
- assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
392
- output = data[i]['conversations'][turn_start + 1]['value']
393
- examplenew = example1.copy()
394
- assert not chat, "No gradio must use chat=False, uses nochat instruct"
395
- examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
396
- examplenew[eval_func_param_names.index('iinput_nochat')] = '' # no input
397
- examplenew[eval_func_param_names.index('context')] = get_context(chat_context, prompt_type)
398
- examples.append(examplenew)
399
- responses.append(output)
400
-
401
- num_examples = len(examples)
402
- scoring_path = 'scoring'
403
- os.makedirs(scoring_path, exist_ok=True)
404
- if eval_sharegpt_as_output:
405
- used_base_model = 'gpt35'
406
- used_lora_weights = ''
407
- else:
408
- used_base_model = str(base_model.split('/')[-1])
409
- used_lora_weights = str(lora_weights.split('/')[-1])
410
- eval_filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
411
- eval_sharegpt_prompts_only_seed,
412
- eval_sharegpt_as_output,
413
- used_base_model,
414
- used_lora_weights)
415
- eval_filename = os.path.join(scoring_path, eval_filename)
416
-
417
- # torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
418
- device = 'cpu' if n_gpus == 0 else 'cuda'
419
- context_class = NullContext if n_gpus > 1 or n_gpus == 0 else torch.device
420
-
421
- with context_class(device):
422
- # ensure was set right above before examples generated
423
- assert not stream_output, "stream_output=True does not make sense with example loop"
424
- import time
425
- from functools import partial
426
-
427
- # get score model
428
- smodel, stokenizer, sdevice = get_score_model(**locals())
429
-
430
- if not eval_sharegpt_as_output:
431
- model, tokenizer, device = get_model(**locals())
432
- model_state = [model, tokenizer, device, base_model]
433
- kwargs_evaluate = {k: v for k, v in locals().items() if k in inputs_kwargs_list}
434
- my_db_state = [None]
435
- fun = partial(evaluate, model_state, my_db_state, **kwargs_evaluate)
436
- else:
437
- assert eval_sharegpt_prompts_only > 0
438
-
439
- def get_response(*args, exi=0):
440
- # assumes same ordering of examples and responses
441
- yield responses[exi]
442
-
443
- fun = get_response
444
- t0 = time.time()
445
- score_dump = []
446
-
447
- for exi, ex in enumerate(examples):
448
- instruction = ex[eval_func_param_names.index('instruction_nochat')]
449
- iinput = ex[eval_func_param_names.index('iinput_nochat')]
450
- context = ex[eval_func_param_names.index('context')]
451
- clear_torch_cache()
452
- print("")
453
- print("START" + "=" * 100)
454
- print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
455
- print("-" * 105)
456
- # fun yields as generator, so have to iterate over it
457
- # Also means likely do NOT want --stream_output=True, else would show all generations
458
- gener = fun(*tuple(ex), exi=exi) if eval_sharegpt_as_output else fun(*tuple(ex))
459
- for res in gener:
460
- print(res)
461
- if smodel:
462
- score_with_prompt = False
463
- if score_with_prompt:
464
- data_point = dict(instruction=instruction, input=iinput, context=context)
465
- prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
466
- prompt = prompter.generate_prompt(data_point)
467
- else:
468
- # just raw input and output
469
- if eval_sharegpt_prompts_only > 0:
470
- # only our own examples have this filled at moment
471
- assert iinput in [None, ''], iinput # should be no iinput
472
- if not (chat_context and prompt_type == 'human_bot'):
473
- assert context in [None, ''], context # should be no context
474
- prompt = instruction
475
- cutoff_len = 768 if is_low_mem else 2048
476
- inputs = stokenizer(prompt, res,
477
- return_tensors="pt",
478
- truncation=True,
479
- max_length=cutoff_len)
480
- try:
481
- score = torch.sigmoid(smodel(**inputs).logits[0].float()).cpu().detach().numpy()[0]
482
- except torch.cuda.OutOfMemoryError as e:
483
- print("GPU OOM 1: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
484
- flush=True)
485
- traceback.print_exc()
486
- score = 0.0
487
- clear_torch_cache()
488
- except (Exception, RuntimeError) as e:
489
- if 'Expected all tensors to be on the same device' in str(e) or \
490
- 'expected scalar type Half but found Float' in str(e) or \
491
- 'probability tensor contains either' in str(e) or \
492
- 'cublasLt ran into an error!' in str(e):
493
- print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
494
- flush=True)
495
- traceback.print_exc()
496
- score = 0.0
497
- clear_torch_cache()
498
- else:
499
- raise
500
- print("SCORE %s: %s" % (exi, score), flush=True)
501
- score_dump.append(ex + [prompt, res, score])
502
- # dump every score in case abort
503
- df_scores = pd.DataFrame(score_dump,
504
- columns=eval_func_param_names + eval_extra_columns)
505
- df_scores.to_parquet(eval_filename, index=False)
506
- # plot histogram so far
507
- plt.figure(figsize=(10, 10))
508
- plt.hist(df_scores['score'], bins=20)
509
- score_avg = np.mean(df_scores['score'])
510
- score_median = np.median(df_scores['score'])
511
- plt.title("Score avg: %s median: %s" % (score_avg, score_median))
512
- plt.savefig(eval_filename.replace('.parquet', '.png'))
513
- plt.close()
514
-
515
- print("END" + "=" * 102)
516
- print("")
517
- t2 = time.time()
518
- print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
519
- t1 = time.time()
520
- print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
521
- return eval_filename
522
-
523
- if gradio:
524
  # imported here so don't require gradio to run generate
525
  from gradio_runner import go_gradio
526
 
527
  # get default model
528
  all_kwargs = locals().copy()
529
  if all_kwargs.get('base_model') and not all_kwargs['login_mode_if_model0']:
530
- model0, tokenizer0, device = get_model(**all_kwargs)
 
531
  else:
532
  # if empty model, then don't load anything, just get gradio up
533
  model0, tokenizer0, device = None, None, None
534
  model_state0 = [model0, tokenizer0, device, all_kwargs['base_model']]
535
 
536
  # get score model
537
- smodel, stokenizer, sdevice = get_score_model(**all_kwargs)
 
 
538
  score_model_state0 = [smodel, stokenizer, sdevice, score_model]
539
 
540
  if enable_captions:
@@ -546,6 +439,7 @@ def main(
546
  else:
547
  caption_loader = False
548
 
 
549
  go_gradio(**locals())
550
 
551
 
@@ -624,12 +518,15 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
624
  else:
625
  device_map = {'': 'cpu'}
626
  model_kwargs['load_in_8bit'] = False
 
627
  print('device_map: %s' % device_map, flush=True)
628
 
629
  load_in_8bit = model_kwargs.get('load_in_8bit', False)
 
630
  model_kwargs['device_map'] = device_map
 
631
 
632
- if load_in_8bit or not load_half:
633
  model = model_loader.from_pretrained(
634
  base_model,
635
  config=config,
@@ -646,6 +543,7 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
646
 
647
  def get_model(
648
  load_8bit: bool = False,
 
649
  load_half: bool = True,
650
  infer_devices: bool = True,
651
  base_model: str = '',
@@ -659,12 +557,14 @@ def get_model(
659
  use_auth_token: Union[str, bool] = False,
660
  trust_remote_code: bool = True,
661
  offload_folder: str = None,
662
- compile: bool = True,
663
- **kwargs,
 
664
  ):
665
  """
666
 
667
  :param load_8bit: load model in 8-bit, not supported by all models
 
668
  :param load_half: load model in 16-bit
669
  :param infer_devices: Use torch infer of optimal placement of layers on devices (for non-lora case)
670
  For non-LORA case, False will spread shards across multiple GPUs, but this can lead to cuda:x cuda:y mismatches
@@ -679,26 +579,29 @@ def get_model(
679
  :param use_auth_token: assumes user did on CLI `huggingface-cli login` to access private repo
680
  :param trust_remote_code: trust code needed by model
681
  :param offload_folder: offload folder
682
- :param compile: whether to compile torch model
683
- :param kwargs:
684
  :return:
685
  """
686
- print("Get %s model" % base_model, flush=True)
687
- if base_model in ['llama', 'gptj']:
 
688
  from gpt4all_llm import get_model_tokenizer_gpt4all
689
  model, tokenizer, device = get_model_tokenizer_gpt4all(base_model)
690
  return model, tokenizer, device
691
 
692
  if lora_weights is not None and lora_weights.strip():
693
- print("Get %s lora weights" % lora_weights, flush=True)
 
694
  device = get_device()
695
 
696
  if 'gpt2' in base_model.lower():
697
  # RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Half
698
  load_8bit = False
 
699
 
700
  assert base_model.strip(), (
701
- "Please choose a base model with --base_model (CLI) or in Models Tab (gradio)"
702
  )
703
 
704
  from transformers import AutoConfig
@@ -709,8 +612,9 @@ def get_model(
709
  llama_type_from_name = "llama" in base_model.lower()
710
  llama_type = llama_type_from_config or llama_type_from_name
711
  if llama_type:
712
- print("Detected as llama type from"
713
- " config (%s) or name (%s)" % (llama_type_from_config, llama_type_from_name), flush=True)
 
714
 
715
  model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=reward_type)
716
  if not tokenizer_base_model:
@@ -744,7 +648,8 @@ def get_model(
744
  )
745
  if 'mbart-' not in base_model.lower() and 'mpt-' not in base_model.lower():
746
  model_kwargs.update(dict(load_in_8bit=load_8bit,
747
- device_map={"": 0} if load_8bit and device == 'cuda' else "auto",
 
748
  ))
749
  if 'mpt-' in base_model.lower() and gpu_id >= 0:
750
  model_kwargs.update(dict(device_map={"": gpu_id} if device == 'cuda' else "cpu"))
@@ -753,6 +658,7 @@ def get_model(
753
  # FIXME: could put on other GPUs
754
  model_kwargs['device_map'] = {"": 0} if device == 'cuda' else {"": 'cpu'}
755
  model_kwargs.pop('torch_dtype', None)
 
756
 
757
  if not lora_weights:
758
  with torch.device(device):
@@ -764,7 +670,7 @@ def get_model(
764
  offload_folder=offload_folder,
765
  )
766
  else:
767
- if load_half and not load_8bit:
768
  model = model_loader.from_pretrained(
769
  base_model,
770
  **model_kwargs).half()
@@ -772,7 +678,7 @@ def get_model(
772
  model = model_loader.from_pretrained(
773
  base_model,
774
  **model_kwargs)
775
- elif load_8bit:
776
  model = model_loader.from_pretrained(
777
  base_model,
778
  **model_kwargs
@@ -821,24 +727,62 @@ def get_model(
821
 
822
  if not isinstance(tokenizer, str):
823
  model.eval()
824
- if torch.__version__ >= "2" and sys.platform != "win32" and compile:
825
  model = torch.compile(model)
826
 
 
 
827
  return model, tokenizer, device
828
 
829
 
830
- def get_score_model(**kwargs):
831
- # score model
832
- if kwargs.get('score_model') is not None and kwargs.get('score_model').strip():
833
- score_all_kwargs = kwargs.copy()
834
- score_all_kwargs['load_8bit'] = False
835
- score_all_kwargs['load_half'] = False
836
- score_all_kwargs['base_model'] = kwargs.get('score_model').strip()
837
- score_all_kwargs['tokenizer_base_model'] = ''
838
- score_all_kwargs['lora_weights'] = ''
839
- score_all_kwargs['llama_type'] = False
840
- score_all_kwargs['compile'] = False
841
- smodel, stokenizer, sdevice = get_model(**score_all_kwargs)
 
 
842
  else:
843
  smodel, stokenizer, sdevice = None, None, None
844
  return smodel, stokenizer, sdevice
@@ -864,6 +808,7 @@ eval_func_param_names = ['instruction',
864
  'instruction_nochat',
865
  'iinput_nochat',
866
  'langchain_mode',
 
867
  'document_choice',
868
  ]
869
 
@@ -892,6 +837,7 @@ def evaluate(
892
  instruction_nochat,
893
  iinput_nochat,
894
  langchain_mode,
 
895
  document_choice,
896
  # END NOTE: Examples must have same order of parameters
897
  src_lang=None,
@@ -901,27 +847,29 @@ def evaluate(
901
  save_dir=None,
902
  sanitize_bot_response=True,
903
  model_state0=None,
904
- is_low_mem=None,
905
  raise_generate_gpu_exceptions=None,
906
  chat_context=None,
907
  lora_weights=None,
908
  load_db_if_exists=True,
909
  dbs=None,
910
  user_path=None,
 
911
  use_openai_embedding=None,
912
  use_openai_model=None,
913
  hf_embedding_model=None,
914
  chunk=None,
915
  chunk_size=None,
916
  db_type=None,
917
- k=None,
918
  n_jobs=None,
919
  first_para=None,
920
  text_limit=None,
 
 
921
  ):
922
  # ensure passed these
923
  assert concurrency_count is not None
924
- assert is_low_mem is not None
925
  assert raise_generate_gpu_exceptions is not None
926
  assert chat_context is not None
927
  assert use_openai_embedding is not None
@@ -930,7 +878,7 @@ def evaluate(
930
  assert chunk is not None
931
  assert chunk_size is not None
932
  assert db_type is not None
933
- assert k is not None
934
  assert n_jobs is not None
935
  assert first_para is not None
936
 
@@ -940,7 +888,7 @@ def evaluate(
940
  locals_dict.pop('model_state0', None)
941
  print(locals_dict)
942
 
943
- no_model_msg = "Please choose a base model with --base_model (CLI) or in Models Tab (gradio).\nThen start New Conversation"
944
 
945
  if model_state0 is None:
946
  # e.g. for no gradio case, set dummy value, else should be set
@@ -990,7 +938,7 @@ def evaluate(
990
  db1 = dbs[langchain_mode]
991
  else:
992
  db1 = None
993
- if langchain_mode not in [False, 'Disabled', 'ChatLLM', 'LLM'] and db1 is not None or base_model in ['llama', 'gptj']:
994
  query = instruction if not iinput else "%s\n%s" % (instruction, iinput)
995
  outr = ""
996
  # use smaller cut_distanct for wiki_full since so many matches could be obtained, and often irrelevant unless close
@@ -1002,6 +950,7 @@ def evaluate(
1002
  load_db_if_exists=load_db_if_exists,
1003
  db=db1,
1004
  user_path=user_path,
 
1005
  max_new_tokens=max_new_tokens,
1006
  cut_distanct=1.1 if langchain_mode in ['wiki_full'] else 1.64, # FIXME, too arbitrary
1007
  use_openai_embedding=use_openai_embedding,
@@ -1014,21 +963,28 @@ def evaluate(
1014
  langchain_mode=langchain_mode,
1015
  document_choice=document_choice,
1016
  db_type=db_type,
1017
- k=k,
1018
  temperature=temperature,
1019
  repetition_penalty=repetition_penalty,
1020
  top_k=top_k,
1021
  top_p=top_p,
1022
  prompt_type=prompt_type,
1023
  n_jobs=n_jobs,
 
 
1024
  ):
1025
- outr = r # doesn't accumulate, new answer every yield, so only save that full answer
1026
- yield r
1027
  if save_dir:
1028
  save_generate_output(output=outr, base_model=base_model, save_dir=save_dir)
1029
- print('Post-Generate Langchain: %s decoded_output: %s' % (str(datetime.now()), len(outr) if outr else -1),
1030
- flush=True)
1031
- if outr:
 
 
1032
  return
1033
 
1034
  if isinstance(tokenizer, str):
@@ -1038,7 +994,7 @@ def evaluate(
1038
  else:
1039
  raise RuntimeError("No such task type %s" % tokenizer)
1040
  # NOTE: uses max_length only
1041
- yield model(prompt, max_length=max_new_tokens)[0][key]
1042
 
1043
  if 'mbart-' in base_model.lower():
1044
  assert src_lang is not None
@@ -1048,7 +1004,7 @@ def evaluate(
1048
  # override, ignore user change
1049
  num_return_sequences = 1
1050
  stopping_criteria = get_stopping(prompt_type, tokenizer, device)
1051
- _, _, max_length_tokenize, max_prompt_length = get_cutoffs(is_low_mem)
1052
  prompt = prompt[-max_prompt_length:]
1053
  inputs = tokenizer(prompt,
1054
  return_tensors="pt",
@@ -1059,6 +1015,10 @@ def evaluate(
1059
  if debug and len(inputs["input_ids"]) > 0:
1060
  print('input_ids length', len(inputs["input_ids"][0]), flush=True)
1061
  input_ids = inputs["input_ids"].to(device)
 
 
1062
  generation_config = GenerationConfig(
1063
  temperature=float(temperature),
1064
  top_p=float(top_p),
@@ -1111,10 +1071,12 @@ def evaluate(
1111
  # https://github.com/h2oai/h2ogpt/issues/104
1112
  # but only makes sense if concurrency_count == 1
1113
  context_class = NullContext # if concurrency_count > 1 else filelock.FileLock
1114
- print('Pre-Generate: %s' % str(datetime.now()), flush=True)
 
1115
  decoded_output = None
1116
  with context_class("generate.lock"):
1117
- print('Generate: %s' % str(datetime.now()), flush=True)
 
1118
  # decoded tokenized prompt can deviate from prompt due to special characters
1119
  inputs_decoded = decoder(input_ids[0])
1120
  inputs_decoded_raw = decoder_raw(input_ids[0])
@@ -1136,7 +1098,8 @@ def evaluate(
1136
  decoder = decoder_raw
1137
  decoder_kwargs = decoder_raw_kwargs
1138
  else:
1139
- print("WARNING: Special characters in prompt", flush=True)
 
1140
  if stream_output:
1141
  skip_prompt = False
1142
  streamer = H2OTextIteratorStreamer(tokenizer, skip_prompt=skip_prompt, block=False,
@@ -1155,8 +1118,9 @@ def evaluate(
1155
  if bucket.qsize() > 0 or thread.exc:
1156
  thread.join()
1157
  outputs += new_text
1158
- yield prompter.get_response(outputs, prompt=inputs_decoded,
1159
- sanitize_bot_response=sanitize_bot_response)
 
1160
  except BaseException:
1161
  # if any exception, raise that exception if was from thread, first
1162
  if thread.exc:
@@ -1173,14 +1137,15 @@ def evaluate(
1173
  else:
1174
  outputs = model.generate(**gen_kwargs)
1175
  outputs = [decoder(s) for s in outputs.sequences]
1176
- yield prompter.get_response(outputs, prompt=inputs_decoded,
1177
- sanitize_bot_response=sanitize_bot_response)
1178
  if outputs and len(outputs) >= 1:
1179
  decoded_output = prompt + outputs[0]
1180
  if save_dir and decoded_output:
1181
  save_generate_output(output=decoded_output, base_model=base_model, save_dir=save_dir)
1182
- print('Post-Generate: %s decoded_output: %s' % (
1183
- str(datetime.now()), len(decoded_output) if decoded_output else -1), flush=True)
 
1184
 
1185
 
1186
  inputs_list_names = list(inspect.signature(evaluate).parameters)
@@ -1188,12 +1153,15 @@ state_names = ['model_state', 'my_db_state']
1188
  inputs_kwargs_list = [x for x in inputs_list_names if x not in eval_func_param_names + state_names]
1189
 
1190
 
1191
- def get_cutoffs(is_low_mem, for_context=False):
1192
  # help to avoid errors like:
1193
  # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
1194
  # RuntimeError: expected scalar type Half but found Float
1195
  # with - 256
1196
- max_length_tokenize = 768 - 256 if is_low_mem else 2048 - 256
 
 
 
1197
  cutoff_len = max_length_tokenize * 4 # if reaches limit, then can't generate new tokens
1198
  output_smallest = 30 * 4
1199
  max_prompt_length = cutoff_len - output_smallest
@@ -1286,7 +1254,7 @@ def get_generate_params(model_lower, chat,
1286
  prompt_type, temperature, top_p, top_k, num_beams,
1287
  max_new_tokens, min_new_tokens, early_stopping, max_time,
1288
  repetition_penalty, num_return_sequences,
1289
- do_sample):
1290
  use_defaults = False
1291
  use_default_examples = True
1292
  examples = []
@@ -1303,7 +1271,8 @@ def get_generate_params(model_lower, chat,
1303
 
1304
  if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
1305
  prompt_type = inv_prompt_type_to_model_lower[model_lower]
1306
- print("Auto-selecting prompt_type=%s for %s" % (prompt_type, model_lower), flush=True)
 
1307
 
1308
  # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
1309
  if show_examples is None:
@@ -1366,9 +1335,6 @@ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-pa
1366
  prompt_type = prompt_type or 'plain'
1367
  else:
1368
  prompt_type = ''
1369
- examples += [[summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else '', "",
1370
- stream_output, prompt_type or 'plain', 0.1, 0.75, 40, 4, 256, 0, False, max_time_defaults, 1.0, 1,
1371
- False]]
1372
  task_info = "No task"
1373
  if prompt_type == 'instruct':
1374
  task_info = "Answer question or follow imperative as instruction with optionally input."
@@ -1443,13 +1409,15 @@ y = np.random.randint(0, 1, 100)
1443
 
1444
  # fit random forest classifier with 20 estimators""", ''] + params_list,
1445
  ]
 
 
1446
 
1447
  src_lang = "English"
1448
  tgt_lang = "Russian"
1449
 
1450
  # move to correct position
1451
  for example in examples:
1452
- example += [chat, '', '', 'Disabled', ['All']]
1453
  # adjust examples if non-chat mode
1454
  if not chat:
1455
  example[eval_func_param_names.index('instruction_nochat')] = example[
@@ -1521,6 +1489,32 @@ def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_l
1521
  return score
1522
 
1523
 
 
 
1524
  if __name__ == "__main__":
1525
  """
1526
  Examples:
 
9
  import time
10
  import traceback
11
  import typing
12
+ import warnings
13
  from datetime import datetime
14
  import filelock
15
  import psutil
16
 
17
+ os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
18
+ os.environ['BITSANDBYTES_NOWELCOME'] = '1'
19
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
20
+
21
  from loaders import get_loaders
22
  from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial, EThread, get_githash, \
23
+ import_matplotlib, get_device, makedirs, get_kwargs
24
 
25
  import_matplotlib()
 
26
 
27
  SEED = 1236
28
  set_seed(SEED)
29
 
 
30
  from typing import Union
 
 
31
 
32
  import fire
33
  import torch
 
35
  from transformers import GenerationConfig, AutoModel, TextIteratorStreamer
36
  from accelerate import init_empty_weights, infer_auto_device_map
37
 
38
+ from prompter import Prompter, inv_prompt_type_to_model_lower, non_hf_types
39
  from stopping import get_stopping
40
 
41
  eval_extra_columns = ['prompt', 'response', 'score']
 
48
 
49
  def main(
50
  load_8bit: bool = False,
51
+ load_4bit: bool = False,
52
  load_half: bool = True,
53
  infer_devices: bool = True,
54
  base_model: str = '',
55
  tokenizer_base_model: str = '',
56
  lora_weights: str = "",
57
  gpu_id: int = 0,
58
+ compile_model: bool = True,
59
 
60
  prompt_type: Union[int, str] = None,
61
  # input to generation
 
71
  early_stopping: Union[bool, str] = None,
72
  max_time: float = None,
73
 
74
+ memory_restriction_level: int = None,
75
  debug: bool = False,
76
  save_dir: str = None,
77
  share: bool = True,
 
84
  src_lang: str = "English",
85
  tgt_lang: str = "Russian",
86
 
87
+ cli: bool = False,
88
+ cli_loop: bool = True,
89
  gradio: bool = True,
90
  gradio_avoid_processing_markdown: bool = False,
91
+ gradio_offline_level: int = 0,
92
  chat: bool = True,
93
  chat_context: bool = False,
94
  stream_output: bool = True,
95
  show_examples: bool = None,
96
  verbose: bool = False,
97
+ h2ocolors: bool = False,
98
+ height: int = 600,
99
  show_lora: bool = True,
100
  login_mode_if_model0: bool = False,
101
  block_gradio_exit: bool = True,
 
114
  score_model: str = 'OpenAssistant/reward-model-deberta-v3-large-v2',
115
  auto_score: bool = True,
116
 
117
+ eval_filename: str = None,
118
+ eval_prompts_only_num: int = 0,
119
+ eval_prompts_only_seed: int = 1234,
120
+ eval_as_output: bool = False,
121
 
122
  langchain_mode: str = 'Disabled',
123
  visible_langchain_modes: list = ['UserData', 'MyData'],
124
+ document_choice: list = ['All'],
125
  user_path: str = None,
126
+ detect_user_path_changes_every_query: bool = False,
127
  load_db_if_exists: bool = True,
128
  keep_sources_in_context: bool = False,
129
  db_type: str = 'chroma',
 
137
  enable_sources_list: bool = True,
138
  chunk: bool = True,
139
  chunk_size: int = 512,
140
+ top_k_docs: int = 3, # FIXME: Can go back to 4 once https://github.com/h2oai/h2ogpt/issues/192 fixed
141
  n_jobs: int = -1,
142
  enable_captions: bool = True,
143
  captions_model: str = "Salesforce/blip-image-captioning-base",
 
148
  """
149
 
150
  :param load_8bit: load model in 8-bit using bitsandbytes
151
+ :param load_4bit: load model in 4-bit using bitsandbytes
152
  :param load_half: load model in float16
153
  :param infer_devices: whether to control devices with gpu_id. If False, then spread across GPUs
154
+ :param base_model: model HF-type name. If --base_model is used to preload the model, it cannot be unloaded in the gradio Models tab
155
+ :param tokenizer_base_model: tokenizer HF-type name. Usually not required, inferred from base_model.
156
  :param lora_weights: LORA weights path/HF link
157
  :param gpu_id: if infer_devices, then use gpu_id for cuda device ID, or auto mode if gpu_id != -1
158
+ :param compile_model: whether to compile the model
159
  :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
160
  :param temperature: generation temperature
161
  :param top_p: generation top_p
 
168
  :param min_new_tokens: generation min tokens
169
  :param early_stopping: generation early stopping
170
  :param max_time: maximum time to allow for generation
171
+ :param memory_restriction_level: 0 = no restriction to tokens or model, 1 = some restrictions on tokens, 2 = HF-like restriction, 3 = very low memory case
172
  :param debug: enable debug mode
173
  :param save_dir: directory chat data is saved to
174
  :param share: whether to share the gradio app with sharable URL
 
179
  :param offload_folder: path for spilling model onto disk
180
  :param src_lang: source languages to include if doing translation (None = all)
181
  :param tgt_lang: target languages to include if doing translation (None = all)
182
+ :param cli: whether to use CLI (non-gradio) interface.
183
+ :param cli_loop: whether to loop for CLI (False usually only for testing)
184
  :param gradio: whether to enable gradio, or to enable benchmark mode
185
  :param gradio_avoid_processing_markdown:
186
+ :param gradio_offline_level: if > 0, change fonts so the app is fully offline
187
+ == 1 means backend won't need internet for fonts, but front-end UI might if font not cached
188
+ == 2 means backend and frontend don't need internet to download any fonts.
189
+ Note: Some things always disabled include HF telemetry, gradio telemetry, chromadb posthog that involve uploading.
190
+ This option further disables google fonts for downloading, which is less intrusive than uploading,
191
+ but still required in air-gapped case. The fonts don't look as nice as google fonts, but ensure full offline behavior.
192
  :param chat: whether to enable chat mode with chat history
193
  :param chat_context: whether to use extra helpful context if human_bot
194
  :param stream_output: whether to stream output from generate
 
211
  :param extra_lora_options: extra LORA to show in list in gradio
212
  :param score_model: which model to score responses (None means no scoring)
213
  :param auto_score: whether to automatically score responses
214
+ :param eval_filename: JSON file to use for evaluation; if None, uses ShareGPT
215
+ :param eval_prompts_only_num: for no-gradio benchmark, number of eval_filename prompts to use for eval instead of examples
216
+ :param eval_prompts_only_seed: for no gradio benchmark, seed for eval_filename sampling
217
+ :param eval_as_output: for no gradio benchmark, whether to test eval_filename output itself
218
  :param langchain_mode: Data source to include. Choose "UserData" to only consume files from make_db.py.
219
  WARNING: wiki_full requires extra data processing via read_wiki_full.py and requires really good workstation to generate db, unless already present.
220
+ :param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode.
221
+ If already have db, any new/changed files are added automatically if path set, does not have to be same path used for prior db sources
222
+ :param detect_user_path_changes_every_query: whether to detect, at every similarity search, if any files changed or were added (by file hashes).
223
+ Expensive for large number of files, so not done by default. By default only detect changes during db loading.
224
  :param visible_langchain_modes: dbs to generate at launch to be ready for LLM
225
  Can be up to ['wiki', 'wiki_full', 'UserData', 'MyData', 'github h2oGPT', 'DriverlessAI docs']
226
  But wiki_full is expensive and requires preparation
227
  To allow scratch space only live in session, add 'MyData' to list
228
  Default: If only want to consume local files, e.g. prepared by make_db.py, only include ['UserData']
229
  FIXME: Avoid 'All' for now, not implemented
230
+ :param document_choice: Default document choice when taking subset of collection
231
  :param load_db_if_exists: Whether to load chroma db if exists or re-generate db
232
  :param keep_sources_in_context: Whether to keep url sources in context, not helpful usually
233
+ :param db_type: 'faiss' for in-memory or 'chroma' or 'weaviate' for persisted on disk
234
  :param use_openai_embedding: Whether to use OpenAI embeddings for vector db
235
  :param use_openai_model: Whether to use OpenAI model for use with vector db
236
  :param hf_embedding_model: Which HF embedding model to use for vector db
237
  :param allow_upload_to_user_data: Whether to allow file uploads to update shared vector db
238
  :param allow_upload_to_my_data: Whether to allow file uploads to update scratch vector db
239
  :param enable_url_upload: Whether to allow upload from URL
240
+ :param enable_text_upload: Whether to allow upload of text
241
  :param enable_sources_list: Whether to allow list (or download for non-shared db) of list of sources for chosen db
242
  :param chunk: Whether to chunk data (True unless know data is already optimally chunked)
243
 :param chunk_size: Size of chunks, with typically top-4 passed to LLM, so needs to fit in context length
244
+ :param top_k_docs: number of chunks to give LLM
245
  :param n_jobs: Number of processors to use when consuming documents (-1 = all, is default)
246
  :param enable_captions: Whether to support captions using BLIP for image files as documents, then preloads that model
247
  :param captions_model: Which model to use for captions.
 
259
  is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
260
  is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
261
  is_public = is_hf or is_gpth2oai # multi-user case with fixed model and disclaimer
262
+ if memory_restriction_level is None:
263
+ memory_restriction_level = 2 if is_hf else 0 # 2 assumes run on 24GB consumer GPU
264
+ else:
265
+ assert 0 <= memory_restriction_level <= 3, "Bad memory_restriction_level=%s" % memory_restriction_level
266
  admin_pass = os.getenv("ADMIN_PASS")
267
  # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
268
  # but becomes unrecoverable sometimes if raise, so just be silent for now
 
294
  # by default don't sample, too chatty
295
  do_sample = False if do_sample is None else do_sample
296
 
297
+ if memory_restriction_level == 2:
298
  if not base_model:
299
  base_model = 'h2oai/h2ogpt-oasst1-512-12b'
300
  # don't set load_8bit if passed base_model, doesn't always work so can't just override
301
  load_8bit = True
302
+ load_4bit = False # FIXME - consider using 4-bit instead of 8-bit
303
  else:
304
  base_model = 'h2oai/h2ogpt-oasst1-512-20b' if not base_model else base_model
305
+ if memory_restriction_level >= 2:
306
  load_8bit = True
307
+ load_4bit = False # FIXME - consider using 4-bit instead of 8-bit
308
  if is_hf:
309
  # must override share if in spaces
310
  share = False
311
  save_dir = os.getenv('SAVE_DIR', save_dir)
312
  score_model = os.getenv('SCORE_MODEL', score_model)
313
+ if score_model == 'None' or score_model is None:
314
  score_model = ''
315
  concurrency_count = int(os.getenv('CONCURRENCY_COUNT', concurrency_count))
316
  api_open = bool(int(os.getenv('API_OPEN', api_open)))
 
320
  if n_gpus == 0:
321
  gpu_id = None
322
  load_8bit = False
323
+ load_4bit = False
324
  load_half = False
325
  infer_devices = False
326
  torch.backends.cudnn.benchmark = True
 
360
  max_new_tokens, min_new_tokens, early_stopping, max_time,
361
  repetition_penalty, num_return_sequences,
362
  do_sample,
363
+ top_k_docs,
364
+ verbose,
365
  )
366
 
367
  locals_dict = locals()
368
  locals_print = '\n'.join(['%s: %s' % (k, v) for k, v in locals_dict.items()])
369
+ if verbose:
370
+ print(f"Generating model with params:\n{locals_print}", flush=True)
371
+ print("Command: %s\nHash: %s" % (str(' '.join(sys.argv)), get_githash()), flush=True)
372
 
373
  if langchain_mode != "Disabled":
374
  # SECOND PLACE where LangChain referenced, but all imports are kept local so not required
 
388
  # FIXME: All should be avoided until scans over each db, shouldn't be separate db
389
  continue
390
  persist_directory1 = 'db_dir_%s' % langchain_mode1 # single place, no special names for each case
391
+ db = prep_langchain(persist_directory1,
392
+ load_db_if_exists,
393
+ db_type, use_openai_embedding,
394
  langchain_mode1, user_path,
395
  hf_embedding_model,
396
  kwargs_make_db=locals())
 
404
  assert 'gpt_langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
405
  assert 'langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
406
 
407
+ if cli:
408
+ from cli import run_cli
409
+ return run_cli(**get_kwargs(run_cli, exclude_names=['model_state0'], **locals()))
410
+ elif not gradio:
411
+ from eval import run_eval
412
+ return run_eval(**get_kwargs(run_eval, exclude_names=['model_state0'], **locals()))
413
+ elif gradio:
 
 
414
  # imported here so don't require gradio to run generate
415
  from gradio_runner import go_gradio
416
 
417
  # get default model
418
  all_kwargs = locals().copy()
419
  if all_kwargs.get('base_model') and not all_kwargs['login_mode_if_model0']:
420
+ model0, tokenizer0, device = get_model(reward_type=False,
421
+ **get_kwargs(get_model, exclude_names=['reward_type'], **all_kwargs))
422
  else:
423
  # if empty model, then don't load anything, just get gradio up
424
  model0, tokenizer0, device = None, None, None
425
  model_state0 = [model0, tokenizer0, device, all_kwargs['base_model']]
426
 
427
  # get score model
428
+ smodel, stokenizer, sdevice = get_score_model(reward_type=True,
429
+ **get_kwargs(get_score_model, exclude_names=['reward_type'],
430
+ **all_kwargs))
431
  score_model_state0 = [smodel, stokenizer, sdevice, score_model]
432
 
433
  if enable_captions:
 
439
  else:
440
  caption_loader = False
441
 
442
+ # assume gradio needs everything
443
  go_gradio(**locals())
444
 
445
 
 
518
  else:
519
  device_map = {'': 'cpu'}
520
  model_kwargs['load_in_8bit'] = False
521
+ model_kwargs['load_in_4bit'] = False
522
  print('device_map: %s' % device_map, flush=True)
523
 
524
  load_in_8bit = model_kwargs.get('load_in_8bit', False)
525
+ load_in_4bit = model_kwargs.get('load_in_4bit', False)
526
  model_kwargs['device_map'] = device_map
527
+ pop_unused_model_kwargs(model_kwargs)
528
 
529
+ if load_in_8bit or load_in_4bit or not load_half:
530
  model = model_loader.from_pretrained(
531
  base_model,
532
  config=config,
 
543
 
544
  def get_model(
545
  load_8bit: bool = False,
546
+ load_4bit: bool = False,
547
  load_half: bool = True,
548
  infer_devices: bool = True,
549
  base_model: str = '',
 
557
  use_auth_token: Union[str, bool] = False,
558
  trust_remote_code: bool = True,
559
  offload_folder: str = None,
560
+ compile_model: bool = True,
561
+
562
+ verbose: bool = False,
563
  ):
564
  """
565
 
566
  :param load_8bit: load model in 8-bit, not supported by all models
567
+ :param load_4bit: load model in 4-bit, not supported by all models
568
  :param load_half: load model in 16-bit
569
  :param infer_devices: Use torch infer of optimal placement of layers on devices (for non-lora case)
570
  For non-LORA case, False will spread shards across multiple GPUs, but this can lead to cuda:x cuda:y mismatches
 
579
  :param use_auth_token: assumes user did on CLI `huggingface-cli login` to access private repo
580
  :param trust_remote_code: trust code needed by model
581
  :param offload_folder: offload folder
582
+ :param compile_model: whether to compile torch model
583
+ :param verbose:
584
  :return:
585
  """
586
+ if verbose:
587
+ print("Get %s model" % base_model, flush=True)
588
+ if base_model in non_hf_types:
589
  from gpt4all_llm import get_model_tokenizer_gpt4all
590
  model, tokenizer, device = get_model_tokenizer_gpt4all(base_model)
591
  return model, tokenizer, device
592
 
593
  if lora_weights is not None and lora_weights.strip():
594
+ if verbose:
595
+ print("Get %s lora weights" % lora_weights, flush=True)
596
  device = get_device()
597
 
598
  if 'gpt2' in base_model.lower():
599
  # RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Half
600
  load_8bit = False
601
+ load_4bit = False
602
 
603
  assert base_model.strip(), (
604
+ "Please choose a base model with --base_model (CLI) or load one from Models Tab (gradio)"
605
  )
606
 
607
  from transformers import AutoConfig
 
612
  llama_type_from_name = "llama" in base_model.lower()
613
  llama_type = llama_type_from_config or llama_type_from_name
614
  if llama_type:
615
+ if verbose:
616
+ print("Detected as llama type from"
617
+ " config (%s) or name (%s)" % (llama_type_from_config, llama_type_from_name), flush=True)
618
 
619
  model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=reward_type)
620
  if not tokenizer_base_model:
 
648
  )
649
  if 'mbart-' not in base_model.lower() and 'mpt-' not in base_model.lower():
650
  model_kwargs.update(dict(load_in_8bit=load_8bit,
651
+ load_in_4bit=load_4bit,
652
+ device_map={"": 0} if (load_8bit or load_4bit) and device == 'cuda' else "auto",
653
  ))
654
  if 'mpt-' in base_model.lower() and gpu_id >= 0:
655
  model_kwargs.update(dict(device_map={"": gpu_id} if device == 'cuda' else "cpu"))
 
658
  # FIXME: could put on other GPUs
659
  model_kwargs['device_map'] = {"": 0} if device == 'cuda' else {"": 'cpu'}
660
  model_kwargs.pop('torch_dtype', None)
661
+ pop_unused_model_kwargs(model_kwargs)
662
 
663
  if not lora_weights:
664
  with torch.device(device):
 
670
  offload_folder=offload_folder,
671
  )
672
  else:
673
+ if load_half and not (load_8bit or load_4bit):
674
  model = model_loader.from_pretrained(
675
  base_model,
676
  **model_kwargs).half()
 
678
  model = model_loader.from_pretrained(
679
  base_model,
680
  **model_kwargs)
681
+ elif load_8bit or load_4bit:
682
  model = model_loader.from_pretrained(
683
  base_model,
684
  **model_kwargs
 
727
 
728
  if not isinstance(tokenizer, str):
729
  model.eval()
730
+ if torch.__version__ >= "2" and sys.platform != "win32" and compile_model:
731
  model = torch.compile(model)
732
 
733
+ if hasattr(config, 'max_position_embeddings') and isinstance(config.max_position_embeddings, int):
734
+ # help automatically limit inputs to generate
735
+ tokenizer.model_max_length = config.max_position_embeddings
736
+ else:
737
+ tokenizer.model_max_length = 2048
738
+
739
  return model, tokenizer, device
740
 
741
 
742
+ def pop_unused_model_kwargs(model_kwargs):
743
+ """
744
+ in-place pop unused kwargs that are not dependency-upgrade friendly
745
+ no point passing in False, is default, and helps avoid needing to update requirements for new deps
746
+ :param model_kwargs:
747
+ :return:
748
+ """
749
+ check_list = ['load_in_8bit', 'load_in_4bit']
750
+ for k in check_list:
751
+ if k in model_kwargs and not model_kwargs[k]:
752
+ model_kwargs.pop(k)
753
+
754
+
755
+ def get_score_model(score_model: str = None,
756
+ load_8bit: bool = False,
757
+ load_4bit: bool = False,
758
+ load_half: bool = True,
759
+ infer_devices: bool = True,
760
+ base_model: str = '',
761
+ tokenizer_base_model: str = '',
762
+ lora_weights: str = "",
763
+ gpu_id: int = 0,
764
+
765
+ reward_type: bool = None,
766
+ local_files_only: bool = False,
767
+ resume_download: bool = True,
768
+ use_auth_token: Union[str, bool] = False,
769
+ trust_remote_code: bool = True,
770
+ offload_folder: str = None,
771
+ compile_model: bool = True,
772
+
773
+ verbose: bool = False,
774
+ ):
775
+ if score_model is not None and score_model.strip():
776
+ load_8bit = False
777
+ load_4bit = False
778
+ load_half = False
779
+ base_model = score_model.strip()
780
+ tokenizer_base_model = ''
781
+ lora_weights = ''
782
+ llama_type = False
783
+ compile_model = False
784
+ smodel, stokenizer, sdevice = get_model(reward_type=True,
785
+ **get_kwargs(get_model, exclude_names=['reward_type'], **locals()))
786
  else:
787
  smodel, stokenizer, sdevice = None, None, None
788
  return smodel, stokenizer, sdevice
 
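
For reference, a minimal sketch (not part of the commit) of what the new pop_unused_model_kwargs helper above does to the kwargs before they reach from_pretrained; the values are made up:

model_kwargs = dict(local_files_only=False, torch_dtype='auto',
                    load_in_8bit=False, load_in_4bit=True)
pop_unused_model_kwargs(model_kwargs)
# load_in_8bit is dropped because it is falsy (the default), so older transformers
# versions without the 8-bit/4-bit kwargs never see it; load_in_4bit stays because it was requested
assert model_kwargs == dict(local_files_only=False, torch_dtype='auto', load_in_4bit=True)
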
808
  'instruction_nochat',
809
  'iinput_nochat',
810
  'langchain_mode',
811
+ 'top_k_docs',
812
  'document_choice',
813
  ]
814
 
 
837
  instruction_nochat,
838
  iinput_nochat,
839
  langchain_mode,
840
+ top_k_docs,
841
  document_choice,
842
  # END NOTE: Examples must have same order of parameters
843
  src_lang=None,
 
847
  save_dir=None,
848
  sanitize_bot_response=True,
849
  model_state0=None,
850
+ memory_restriction_level=None,
851
  raise_generate_gpu_exceptions=None,
852
  chat_context=None,
853
  lora_weights=None,
854
  load_db_if_exists=True,
855
  dbs=None,
856
  user_path=None,
857
+ detect_user_path_changes_every_query=None,
858
  use_openai_embedding=None,
859
  use_openai_model=None,
860
  hf_embedding_model=None,
861
  chunk=None,
862
  chunk_size=None,
863
  db_type=None,
 
864
  n_jobs=None,
865
  first_para=None,
866
  text_limit=None,
867
+ verbose=False,
868
+ cli=False,
869
  ):
870
  # ensure passed these
871
  assert concurrency_count is not None
872
+ assert memory_restriction_level is not None
873
  assert raise_generate_gpu_exceptions is not None
874
  assert chat_context is not None
875
  assert use_openai_embedding is not None
 
878
  assert chunk is not None
879
  assert chunk_size is not None
880
  assert db_type is not None
881
+ assert top_k_docs is not None and isinstance(top_k_docs, int)
882
  assert n_jobs is not None
883
  assert first_para is not None
884
 
 
888
  locals_dict.pop('model_state0', None)
889
  print(locals_dict)
890
 
891
+ no_model_msg = "Please choose a base model with --base_model (CLI) or load in Models Tab (gradio).\nThen start New Conversation"
892
 
893
  if model_state0 is None:
894
  # e.g. for no gradio case, set dummy value, else should be set
 
938
  db1 = dbs[langchain_mode]
939
  else:
940
  db1 = None
941
+ if langchain_mode not in [False, 'Disabled', 'ChatLLM', 'LLM'] and db1 is not None or base_model in non_hf_types:
942
  query = instruction if not iinput else "%s\n%s" % (instruction, iinput)
943
  outr = ""
944
  # use smaller cut_distanct for wiki_full since so many matches could be obtained, and often irrelevant unless close
 
950
  load_db_if_exists=load_db_if_exists,
951
  db=db1,
952
  user_path=user_path,
953
+ detect_user_path_changes_every_query=detect_user_path_changes_every_query,
954
  max_new_tokens=max_new_tokens,
955
  cut_distanct=1.1 if langchain_mode in ['wiki_full'] else 1.64, # FIXME, too arbitrary
956
  use_openai_embedding=use_openai_embedding,
 
963
  langchain_mode=langchain_mode,
964
  document_choice=document_choice,
965
  db_type=db_type,
966
+ k=top_k_docs,
967
  temperature=temperature,
968
  repetition_penalty=repetition_penalty,
969
  top_k=top_k,
970
  top_p=top_p,
971
  prompt_type=prompt_type,
972
  n_jobs=n_jobs,
973
+ verbose=verbose,
974
+ cli=cli,
975
  ):
976
+ outr, extra = r # doesn't accumulate, new answer every yield, so only save that full answer
977
+ yield dict(response=outr, sources=extra)
978
  if save_dir:
979
  save_generate_output(output=outr, base_model=base_model, save_dir=save_dir)
980
+ if verbose:
981
+ print(
982
+ 'Post-Generate Langchain: %s decoded_output: %s' % (str(datetime.now()), len(outr) if outr else -1),
983
+ flush=True)
984
+ if outr or base_model in non_hf_types:
985
+ # if got no response (e.g. not showing sources and got no sources,
986
+ # so nothing to give to LLM), then slip through and ask LLM
987
+ # Or if llama/gptj, then just return since they had no response and can't go down below code path
988
  return
989
 
990
  if isinstance(tokenizer, str):
 
994
  else:
995
  raise RuntimeError("No such task type %s" % tokenizer)
996
  # NOTE: uses max_length only
997
+ yield dict(response=model(prompt, max_length=max_new_tokens)[0][key], sources='')
998
 
999
  if 'mbart-' in base_model.lower():
1000
  assert src_lang is not None
 
1004
  # override, ignore user change
1005
  num_return_sequences = 1
1006
  stopping_criteria = get_stopping(prompt_type, tokenizer, device)
1007
+ _, _, max_length_tokenize, max_prompt_length = get_cutoffs(memory_restriction_level, model_max_length=tokenizer.model_max_length)
1008
  prompt = prompt[-max_prompt_length:]
1009
  inputs = tokenizer(prompt,
1010
  return_tensors="pt",
 
1015
  if debug and len(inputs["input_ids"]) > 0:
1016
  print('input_ids length', len(inputs["input_ids"][0]), flush=True)
1017
  input_ids = inputs["input_ids"].to(device)
1018
+ # CRITICAL LIMIT else will fail
1019
+ max_max_tokens = tokenizer.model_max_length
1020
+ max_input_tokens = max_max_tokens - max_new_tokens
1021
+ input_ids = input_ids[:, -max_input_tokens:]
1022
  generation_config = GenerationConfig(
1023
  temperature=float(temperature),
1024
  top_p=float(top_p),
 
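
A small sketch (assumed numbers, not from the commit) of the new input limit applied above, which clips the prompt so that prompt tokens plus requested new tokens fit the model context:

max_max_tokens = 2048            # tokenizer.model_max_length (assumed)
max_new_tokens = 256             # generation budget (assumed)
max_input_tokens = max_max_tokens - max_new_tokens   # 1792
# input_ids = input_ids[:, -max_input_tokens:]       # keep only the most recent 1792 tokens
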
1071
  # https://github.com/h2oai/h2ogpt/issues/104
1072
  # but only makes sense if concurrency_count == 1
1073
  context_class = NullContext # if concurrency_count > 1 else filelock.FileLock
1074
+ if verbose:
1075
+ print('Pre-Generate: %s' % str(datetime.now()), flush=True)
1076
  decoded_output = None
1077
  with context_class("generate.lock"):
1078
+ if verbose:
1079
+ print('Generate: %s' % str(datetime.now()), flush=True)
1080
  # decoded tokenized prompt can deviate from prompt due to special characters
1081
  inputs_decoded = decoder(input_ids[0])
1082
  inputs_decoded_raw = decoder_raw(input_ids[0])
 
1098
  decoder = decoder_raw
1099
  decoder_kwargs = decoder_raw_kwargs
1100
  else:
1101
+ if verbose:
1102
+ print("WARNING: Special characters in prompt", flush=True)
1103
  if stream_output:
1104
  skip_prompt = False
1105
  streamer = H2OTextIteratorStreamer(tokenizer, skip_prompt=skip_prompt, block=False,
 
1118
  if bucket.qsize() > 0 or thread.exc:
1119
  thread.join()
1120
  outputs += new_text
1121
+ yield dict(response=prompter.get_response(outputs, prompt=inputs_decoded,
1122
+ sanitize_bot_response=sanitize_bot_response),
1123
+ sources='')
1124
  except BaseException:
1125
  # if any exception, raise that exception if was from thread, first
1126
  if thread.exc:
 
1137
  else:
1138
  outputs = model.generate(**gen_kwargs)
1139
  outputs = [decoder(s) for s in outputs.sequences]
1140
+ yield dict(response=prompter.get_response(outputs, prompt=inputs_decoded,
1141
+ sanitize_bot_response=sanitize_bot_response), sources='')
1142
  if outputs and len(outputs) >= 1:
1143
  decoded_output = prompt + outputs[0]
1144
  if save_dir and decoded_output:
1145
  save_generate_output(output=decoded_output, base_model=base_model, save_dir=save_dir)
1146
+ if verbose:
1147
+ print('Post-Generate: %s decoded_output: %s' % (
1148
+ str(datetime.now()), len(decoded_output) if decoded_output else -1), flush=True)
1149
 
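
Since evaluate() now yields dicts with 'response' and 'sources' instead of bare strings, a caller consumes it roughly as below (a minimal sketch with a stand-in generator, not the real call):

def fake_evaluate():
    # stand-in for evaluate(...); the real generator streams a growing full answer each yield
    yield dict(response="The answer so", sources='')
    yield dict(response="The answer so far, complete.", sources='')

text = sources = ''
for chunk in fake_evaluate():
    text, sources = chunk['response'], chunk['sources']
print(text, sources)
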
1150
 
1151
  inputs_list_names = list(inspect.signature(evaluate).parameters)
 
1153
  inputs_kwargs_list = [x for x in inputs_list_names if x not in eval_func_param_names + state_names]
1154
 
1155
 
1156
+ def get_cutoffs(memory_restriction_level, for_context=False, model_max_length=2048):
1157
  # help to avoid errors like:
1158
  # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
1159
  # RuntimeError: expected scalar type Half but found Float
1160
  # with - 256
1161
+ if memory_restriction_level > 0:
1162
+ max_length_tokenize = 768 - 256 if memory_restriction_level <= 2 else 512 - 256
1163
+ else:
1164
+ max_length_tokenize = model_max_length - 256
1165
  cutoff_len = max_length_tokenize * 4 # if reaches limit, then can't generate new tokens
1166
  output_smallest = 30 * 4
1167
  max_prompt_length = cutoff_len - output_smallest
 
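
A worked example (assumed inputs) of the new get_cutoffs arithmetic for the unrestricted case:

memory_restriction_level, model_max_length = 0, 2048
max_length_tokenize = model_max_length - 256      # 1792 tokens allowed into the tokenizer
cutoff_len = max_length_tokenize * 4              # ~7168 characters at a rough 4 chars/token
output_smallest = 30 * 4                          # ~120 characters reserved for output
max_prompt_length = cutoff_len - output_smallest  # 7048 characters of prompt retained
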
1254
  prompt_type, temperature, top_p, top_k, num_beams,
1255
  max_new_tokens, min_new_tokens, early_stopping, max_time,
1256
  repetition_penalty, num_return_sequences,
1257
+ do_sample, k, verbose):
1258
  use_defaults = False
1259
  use_default_examples = True
1260
  examples = []
 
1271
 
1272
  if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
1273
  prompt_type = inv_prompt_type_to_model_lower[model_lower]
1274
+ if verbose:
1275
+ print("Auto-selecting prompt_type=%s for %s" % (prompt_type, model_lower), flush=True)
1276
 
1277
  # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
1278
  if show_examples is None:
 
1335
  prompt_type = prompt_type or 'plain'
1336
  else:
1337
  prompt_type = ''
 
 
 
1338
  task_info = "No task"
1339
  if prompt_type == 'instruct':
1340
  task_info = "Answer question or follow imperative as instruction with optionally input."
 
1409
 
1410
  # fit random forest classifier with 20 estimators""", ''] + params_list,
1411
  ]
1412
+ # add summary example
1413
+ examples += [[summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else ''] + params_list]
1414
 
1415
  src_lang = "English"
1416
  tgt_lang = "Russian"
1417
 
1418
  # move to correct position
1419
  for example in examples:
1420
+ example += [chat, '', '', 'Disabled', k, ['All']]
1421
  # adjust examples if non-chat mode
1422
  if not chat:
1423
  example[eval_func_param_names.index('instruction_nochat')] = example[
 
1489
  return score
1490
 
1491
 
1492
+ def check_locals(**kwargs):
1493
+ # ensure everything in evaluate is here
1494
+ can_skip_because_locally_generated = [ # evaluate
1495
+ 'instruction',
1496
+ 'iinput',
1497
+ 'context',
1498
+ 'instruction_nochat',
1499
+ 'iinput_nochat',
1500
+ # get_model:
1501
+ 'reward_type'
1502
+ ]
1503
+ for k in eval_func_param_names:
1504
+ if k in can_skip_because_locally_generated:
1505
+ continue
1506
+ assert k in kwargs, "Missing %s" % k
1507
+ for k in inputs_kwargs_list:
1508
+ if k in can_skip_because_locally_generated:
1509
+ continue
1510
+ assert k in kwargs, "Missing %s" % k
1511
+
1512
+ for k in list(inspect.signature(get_model).parameters):
1513
+ if k in can_skip_because_locally_generated:
1514
+ continue
1515
+ assert k in kwargs, "Missing %s" % k
1516
+
1517
+
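
A hypothetical use of the new check_locals guard (not shown in this hunk): call it wherever the full evaluate/get_model kwargs set is assembled so a dropped key fails fast by name.

# check_locals(**all_kwargs)   # raises AssertionError("Missing <name>") if a kwarg was dropped
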
1518
  if __name__ == "__main__":
1519
  """
1520
  Examples:
gpt4all_llm.py CHANGED
@@ -1,5 +1,6 @@
1
  import inspect
2
  import os
 
3
  from typing import Dict, Any, Optional, List
4
  from langchain.callbacks.manager import CallbackManagerForLLMRun
5
  from pydantic import root_validator
@@ -21,11 +22,11 @@ class FakeTokenizer:
21
 
22
  def get_model_tokenizer_gpt4all(base_model, **kwargs):
23
  # defaults (some of these are generation parameters, so need to be passed in at generation time)
24
- model_kwargs = dict(n_ctx=kwargs.get('max_new_tokens', 256),
25
- n_threads=os.cpu_count() // 2,
26
  temp=kwargs.get('temperature', 0.2),
27
  top_p=kwargs.get('top_p', 0.75),
28
- top_k=kwargs.get('top_k', 40))
 
29
  env_gpt4all_file = ".env_gpt4all"
30
  model_kwargs.update(dotenv_values(env_gpt4all_file))
31
 
@@ -33,43 +34,103 @@ def get_model_tokenizer_gpt4all(base_model, **kwargs):
33
  if 'model_path_llama' not in model_kwargs:
34
  raise ValueError("No model_path_llama in %s" % env_gpt4all_file)
35
  model_path = model_kwargs.pop('model_path_llama')
 
 
36
  from gpt4all import GPT4All as GPT4AllModel
37
- elif base_model == "gptj":
38
- if 'model_path_gptj' not in model_kwargs:
39
- raise ValueError("No model_path_gptj in %s" % env_gpt4all_file)
40
- model_path = model_kwargs.pop('model_path_gptj')
 
 
41
  from gpt4all import GPT4All as GPT4AllModel
 
42
  else:
43
  raise ValueError("No such base_model %s" % base_model)
44
- func_names = list(inspect.signature(GPT4AllModel).parameters)
45
- model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names}
46
- model = GPT4AllModel(model_path, **model_kwargs)
47
  return model, FakeTokenizer(), 'cpu'
48
 
49
 
50
- def get_llm_gpt4all(model_name, model=None,
 
 
51
  max_new_tokens=256,
52
  temperature=0.1,
53
  repetition_penalty=1.0,
54
  top_k=40,
55
- top_p=0.7):
 
56
  env_gpt4all_file = ".env_gpt4all"
57
- model_kwargs = dotenv_values(env_gpt4all_file)
58
- from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
59
- callbacks = [StreamingStdOutCallbackHandler()]
60
- n_ctx = model_kwargs.pop('n_ctx', 1024)
61
- default_params = {'context_erase': 0.5, 'n_batch': 1, 'n_ctx': n_ctx, 'n_predict': max_new_tokens,
62
- 'repeat_last_n': 64 if repetition_penalty != 1.0 else 0, 'repeat_penalty': repetition_penalty,
63
- 'temp': temperature, 'top_k': top_k, 'top_p': top_p}
 
64
  if model_name == 'llama':
65
- from langchain.llms import LlamaCpp
66
- model_path = model_kwargs.pop('model_path_llama') if model is None else model
67
- llm = LlamaCpp(model_path=model_path, n_ctx=n_ctx, callbacks=callbacks, verbose=False)
 
 
68
  else:
69
- model_path = model_kwargs.pop('model_path_gptj') if model is None else model
70
- llm = H2OGPT4All(model=model_path, backend='gptj', callbacks=callbacks,
71
- verbose=False, **default_params,
72
- )
73
  return llm
74
 
75
 
@@ -117,3 +178,78 @@ class H2OGPT4All(gpt4all.GPT4All):
117
  if verbose:
118
  print("_call prompt: %s" % prompt, flush=True)
119
  return super()._call(prompt, stop=stop, run_manager=run_manager)
 
 
 
1
  import inspect
2
  import os
3
+ import sys
4
  from typing import Dict, Any, Optional, List
5
  from langchain.callbacks.manager import CallbackManagerForLLMRun
6
  from pydantic import root_validator
 
22
 
23
  def get_model_tokenizer_gpt4all(base_model, **kwargs):
24
  # defaults (some of these are generation parameters, so need to be passed in at generation time)
25
+ model_kwargs = dict(n_threads=os.cpu_count() // 2,
 
26
  temp=kwargs.get('temperature', 0.2),
27
  top_p=kwargs.get('top_p', 0.75),
28
+ top_k=kwargs.get('top_k', 40),
29
+ n_ctx=2048 - 256)
30
  env_gpt4all_file = ".env_gpt4all"
31
  model_kwargs.update(dotenv_values(env_gpt4all_file))
32
 
 
34
  if 'model_path_llama' not in model_kwargs:
35
  raise ValueError("No model_path_llama in %s" % env_gpt4all_file)
36
  model_path = model_kwargs.pop('model_path_llama')
37
+ # FIXME: GPT4All version of llama doesn't handle new quantization, so use llama_cpp_python
38
+ from llama_cpp import Llama
39
+ # llama sets some things at init model time, not generation time
40
+ func_names = list(inspect.signature(Llama.__init__).parameters)
41
+ model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names}
42
+ model_kwargs['n_ctx'] = int(model_kwargs['n_ctx'])
43
+ model = Llama(model_path=model_path, **model_kwargs)
44
+ elif base_model in "gpt4all_llama":
45
+ if 'model_name_gpt4all_llama' not in model_kwargs and 'model_path_gpt4all_llama' not in model_kwargs:
46
+ raise ValueError("No model_name_gpt4all_llama or model_path_gpt4all_llama in %s" % env_gpt4all_file)
47
+ model_name = model_kwargs.pop('model_name_gpt4all_llama')
48
+ model_type = 'llama'
49
  from gpt4all import GPT4All as GPT4AllModel
50
+ model = GPT4AllModel(model_name=model_name, model_type=model_type)
51
+ elif base_model in "gptj":
52
+ if 'model_name_gptj' not in model_kwargs and 'model_path_gptj' not in model_kwargs:
53
+ raise ValueError("No model_name_gptj or model_path_gptj in %s" % env_gpt4all_file)
54
+ model_name = model_kwargs.pop('model_name_gptj')
55
+ model_type = 'gptj'
56
  from gpt4all import GPT4All as GPT4AllModel
57
+ model = GPT4AllModel(model_name=model_name, model_type=model_type)
58
  else:
59
  raise ValueError("No such base_model %s" % base_model)
 
 
 
60
  return model, FakeTokenizer(), 'cpu'
61
 
62
 
63
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
64
+
65
+
66
+ class H2OStreamingStdOutCallbackHandler(StreamingStdOutCallbackHandler):
67
+
68
+ def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
69
+ """Run on new LLM token. Only available when streaming is enabled."""
70
+ # streaming to std already occurs without this
71
+ # sys.stdout.write(token)
72
+ # sys.stdout.flush()
73
+ pass
74
+
75
+
76
+ def get_model_kwargs(env_kwargs, default_kwargs, cls):
77
+ # default from class
78
+ model_kwargs = {k: v.default for k, v in dict(inspect.signature(cls).parameters).items()}
79
+ # from our defaults
80
+ model_kwargs.update(default_kwargs)
81
+ # from user defaults
82
+ model_kwargs.update(env_kwargs)
83
+ # ensure only valid keys
84
+ func_names = list(inspect.signature(cls).parameters)
85
+ model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names}
86
+ return model_kwargs
87
+
88
+
89
+ def get_llm_gpt4all(model_name,
90
+ model=None,
91
  max_new_tokens=256,
92
  temperature=0.1,
93
  repetition_penalty=1.0,
94
  top_k=40,
95
+ top_p=0.7,
96
+ verbose=False):
97
  env_gpt4all_file = ".env_gpt4all"
98
+ env_kwargs = dotenv_values(env_gpt4all_file)
99
+ callbacks = [H2OStreamingStdOutCallbackHandler()]
100
+ n_ctx = env_kwargs.pop('n_ctx', 2048 - max_new_tokens)
101
+ default_kwargs = dict(context_erase=0.5,
102
+ n_batch=1,
103
+ n_ctx=n_ctx,
104
+ n_predict=max_new_tokens,
105
+ repeat_last_n=64 if repetition_penalty != 1.0 else 0,
106
+ repeat_penalty=repetition_penalty,
107
+ temp=temperature,
108
+ temperature=temperature,
109
+ top_k=top_k,
110
+ top_p=top_p,
111
+ use_mlock=True,
112
+ verbose=verbose)
113
  if model_name == 'llama':
114
+ cls = H2OLlamaCpp
115
+ model_path = env_kwargs.pop('model_path_llama') if model is None else model
116
+ model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls)
117
+ model_kwargs.update(dict(model_path=model_path, callbacks=callbacks))
118
+ llm = cls(**model_kwargs)
119
+ llm.client.verbose = verbose
120
+ elif model_name == 'gpt4all_llama':
121
+ cls = H2OGPT4All
122
+ model_path = env_kwargs.pop('model_path_gpt4all_llama') if model is None else model
123
+ model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls)
124
+ model_kwargs.update(dict(model=model_path, backend='llama', callbacks=callbacks))
125
+ llm = cls(**model_kwargs)
126
+ elif model_name == 'gptj':
127
+ cls = H2OGPT4All
128
+ model_path = env_kwargs.pop('model_path_gptj') if model is None else model
129
+ model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls)
130
+ model_kwargs.update(dict(model=model_path, backend='gptj', callbacks=callbacks))
131
+ llm = cls(**model_kwargs)
132
  else:
133
+ raise RuntimeError("No such model_name %s" % model_name)
 
 
 
134
  return llm
135
 
136
 
 
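
A runnable sketch (using a dummy class, not the real GPT4All wrappers) of the precedence the get_model_kwargs helper above gives: class defaults, then our default_kwargs, then the user's .env_gpt4all values, with unknown keys dropped:

import inspect

class Dummy:                      # stand-in for H2OGPT4All / H2OLlamaCpp
    def __init__(self, n_ctx=2048, temp=0.2):
        self.n_ctx, self.temp = n_ctx, temp

def merge(env_kwargs, default_kwargs, cls):
    params = dict(inspect.signature(cls).parameters)
    merged = {k: v.default for k, v in params.items()}        # class defaults
    merged.update(default_kwargs)                             # our defaults override
    merged.update(env_kwargs)                                 # user .env values win
    return {k: v for k, v in merged.items() if k in params}   # drop unknown keys

print(merge({'temp': 0.3}, {'n_ctx': 1792, 'bogus': 1}, Dummy))   # {'n_ctx': 1792, 'temp': 0.3}
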
178
  if verbose:
179
  print("_call prompt: %s" % prompt, flush=True)
180
  return super()._call(prompt, stop=stop, run_manager=run_manager)
181
+
182
+
183
+ from langchain.llms import LlamaCpp
184
+
185
+
186
+ class H2OLlamaCpp(LlamaCpp):
187
+ model_path: Any
188
+ """Path to the pre-trained GPT4All model file."""
189
+
190
+ @root_validator()
191
+ def validate_environment(cls, values: Dict) -> Dict:
192
+ """Validate that llama-cpp-python library is installed."""
193
+ if isinstance(values["model_path"], str):
194
+ model_path = values["model_path"]
195
+ model_param_names = [
196
+ "lora_path",
197
+ "lora_base",
198
+ "n_ctx",
199
+ "n_parts",
200
+ "seed",
201
+ "f16_kv",
202
+ "logits_all",
203
+ "vocab_only",
204
+ "use_mlock",
205
+ "n_threads",
206
+ "n_batch",
207
+ "use_mmap",
208
+ "last_n_tokens_size",
209
+ ]
210
+ model_params = {k: values[k] for k in model_param_names}
211
+ # For backwards compatibility, only include if non-null.
212
+ if values["n_gpu_layers"] is not None:
213
+ model_params["n_gpu_layers"] = values["n_gpu_layers"]
214
+
215
+ try:
216
+ from llama_cpp import Llama
217
+
218
+ values["client"] = Llama(model_path, **model_params)
219
+ except ImportError:
220
+ raise ModuleNotFoundError(
221
+ "Could not import llama-cpp-python library. "
222
+ "Please install the llama-cpp-python library to "
223
+ "use this embedding model: pip install llama-cpp-python"
224
+ )
225
+ except Exception as e:
226
+ raise ValueError(
227
+ f"Could not load Llama model from path: {model_path}. "
228
+ f"Received error {e}"
229
+ )
230
+ else:
231
+ values["client"] = values["model_path"]
232
+ return values
233
+
234
+ def _call(
235
+ self,
236
+ prompt: str,
237
+ stop: Optional[List[str]] = None,
238
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
239
+ ) -> str:
240
+ verbose = False
241
+ # tokenize twice, just to count tokens, since llama cpp python wrapper has no way to truncate
242
+ prompt_tokens = self.client.tokenize(b" " + prompt.encode("utf-8"))
243
+ num_prompt_tokens = len(prompt_tokens)
244
+ if num_prompt_tokens > self.n_ctx:
245
+ # conservative by using int()
246
+ chars_per_token = int(len(prompt) / num_prompt_tokens)
247
+ prompt = prompt[-self.n_ctx * chars_per_token:]
248
+ if verbose:
249
+ print("reducing tokens, assuming average of %s chars/token" % chars_per_token, flush=True)
250
+ prompt_tokens2 = self.client.tokenize(b" " + prompt.encode("utf-8"))
251
+ num_prompt_tokens2 = len(prompt_tokens2)
252
+ print("reduced tokens from %d -> %d" % (num_prompt_tokens, num_prompt_tokens2), flush=True)
253
+ if verbose:
254
+ print("_call prompt: %s" % prompt, flush=True)
255
+ return super()._call(prompt, stop=stop, run_manager=run_manager)
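
The truncation above works around llama-cpp-python having no built-in prompt truncation; a small sketch of the heuristic with assumed numbers:

n_ctx = 1792                      # model context (assumed)
prompt = "word " * 3000           # prompt too long for the context
num_prompt_tokens = 3000          # what self.client.tokenize(...) would report (assumed)
chars_per_token = int(len(prompt) / num_prompt_tokens)   # 5 here; int() keeps it conservative
prompt = prompt[-n_ctx * chars_per_token:]               # keep only the tail that should fit
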
gpt_langchain.py CHANGED
@@ -3,6 +3,7 @@ import inspect
3
  import os
4
  import pathlib
5
  import pickle
 
6
  import shutil
7
  import subprocess
8
  import sys
@@ -16,9 +17,11 @@ from functools import reduce
16
  from operator import concat
17
 
18
  from joblib import Parallel, delayed
 
19
 
 
20
  from utils import wrapped_partial, EThread, import_matplotlib, sanitize_filename, makedirs, get_url, flatten_list, \
21
- get_device
22
 
23
  import_matplotlib()
24
 
@@ -35,7 +38,6 @@ from langchain.document_loaders import PyPDFLoader, TextLoader, CSVLoader, Pytho
35
  EverNoteLoader, UnstructuredEmailLoader, UnstructuredODTLoader, UnstructuredPowerPointLoader, \
36
  UnstructuredEPubLoader, UnstructuredImageLoader, UnstructuredRTFLoader, ArxivLoader
37
  from langchain.text_splitter import RecursiveCharacterTextSplitter
38
- from langchain.vectorstores import FAISS
39
  from langchain.chains.question_answering import load_qa_chain
40
  from langchain.docstore.document import Document
41
  from langchain import PromptTemplate
@@ -43,17 +45,36 @@ from langchain.vectorstores import Chroma
43
 
44
 
45
  def get_db(sources, use_openai_embedding=False, db_type='faiss', persist_directory="db_dir", langchain_mode='notset',
 
46
  hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
47
  if not sources:
48
  return None
49
  # get embedding model
50
  embedding = get_embedding(use_openai_embedding, hf_embedding_model=hf_embedding_model)
 
 
 
51
 
52
  # Create vector database
53
  if db_type == 'faiss':
 
54
  db = FAISS.from_documents(sources, embedding)
 
 
55
  elif db_type == 'chroma':
56
- collection_name = langchain_mode.replace(' ', '_')
57
  os.makedirs(persist_directory, exist_ok=True)
58
  db = Chroma.from_documents(documents=sources,
59
  embedding=embedding,
@@ -61,34 +82,121 @@ def get_db(sources, use_openai_embedding=False, db_type='faiss', persist_directo
61
  collection_name=collection_name,
62
  anonymized_telemetry=False)
63
  db.persist()
64
- # FIXME: below just proves can load persistent dir, regenerates its embedding files, so a bit wasteful
65
- if False:
66
- db = Chroma(embedding_function=embedding,
67
- persist_directory=persist_directory,
68
- collection_name=collection_name)
69
  else:
70
  raise RuntimeError("No such db_type=%s" % db_type)
71
 
72
  return db
73
 
74
 
75
- def add_to_db(db, sources, db_type='faiss', avoid_dup=True):
 
 
76
  if not sources:
77
- return db
78
  if db_type == 'faiss':
79
  db.add_documents(sources)
 
 
80
  elif db_type == 'chroma':
81
- if avoid_dup:
82
- collection = db.get()
83
- metadata_sources = set([x['source'] for x in collection['metadatas']])
84
- sources = [x for x in sources if x.metadata['source'] not in metadata_sources]
85
- if len(sources) == 0:
86
- return db
 
 
87
  db.add_documents(documents=sources)
88
  db.persist()
89
  else:
90
  raise RuntimeError("No such db_type=%s" % db_type)
91
 
 
 
92
  return db
93
 
94
 
@@ -126,19 +234,23 @@ def get_llm(use_openai_model=False, model_name=None, model=None,
126
  top_k=40,
127
  top_p=0.7,
128
  prompt_type=None,
 
 
129
  ):
130
  if use_openai_model:
131
  from langchain.llms import OpenAI
132
  llm = OpenAI(temperature=0)
133
  model_name = 'openai'
134
  streamer = None
135
- elif model_name in ['gptj', 'llama']:
 
136
  from gpt4all_llm import get_llm_gpt4all
137
  llm = get_llm_gpt4all(model_name, model=model, max_new_tokens=max_new_tokens,
138
  temperature=temperature,
139
  repetition_penalty=repetition_penalty,
140
  top_k=top_k,
141
  top_p=top_p,
 
142
  )
143
  streamer = None
144
  prompt_type = 'plain'
@@ -149,6 +261,7 @@ def get_llm(use_openai_model=False, model_name=None, model=None,
149
  # only used if didn't pass model in
150
  assert model_name is None
151
  assert tokenizer is None
 
152
  model_name = 'h2oai/h2ogpt-oasst1-512-12b'
153
  # model_name = 'h2oai/h2ogpt-oig-oasst1-512-6_9b'
154
  # model_name = 'h2oai/h2ogpt-oasst1-512-20b'
@@ -165,7 +278,12 @@ def get_llm(use_openai_model=False, model_name=None, model=None,
165
  torch_dtype=torch_dtype,
166
  load_in_8bit=load_8bit)
167
 
168
- gen_kwargs = dict(max_new_tokens=max_new_tokens, return_full_text=True, early_stopping=False)
 
 
169
  if stream_output:
170
  skip_prompt = False
171
  from generate import H2OTextIteratorStreamer
@@ -175,17 +293,19 @@ def get_llm(use_openai_model=False, model_name=None, model=None,
175
  else:
176
  streamer = None
177
 
178
- if 'h2ogpt' in model_name or prompt_type == 'human_bot':
179
- from h2oai_pipeline import H2OTextGenerationPipeline
180
- pipe = H2OTextGenerationPipeline(model=model, tokenizer=tokenizer, **gen_kwargs)
181
- # pipe.task = "text-generation"
182
- # below makes it listen only to our prompt removal, not built in prompt removal that is less general and not specific for our model
183
- pipe.task = "text2text-generation"
184
- prompt_type = 'human_bot'
185
- else:
186
- # only for non-instruct tuned cases when ok with just normal next token prediction
187
- from transformers import pipeline
188
- pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **gen_kwargs)
 
 
189
 
190
  from langchain.llms import HuggingFacePipeline
191
  llm = HuggingFacePipeline(pipeline=pipe)
@@ -341,6 +461,12 @@ try:
341
  except (pkg_resources.DistributionNotFound, AssertionError):
342
  have_arxiv = False
343
 
 
 
 
 
 
 
344
  image_types = ["png", "jpg", "jpeg"]
345
  non_image_types = ["pdf", "txt", "csv", "toml", "py", "rst", "rtf",
346
  "md", "html",
@@ -357,9 +483,10 @@ file_types = non_image_types + image_types
357
 
358
  def add_meta(docs1, file):
359
  file_extension = pathlib.Path(file).suffix
 
360
  if not isinstance(docs1, list):
361
  docs1 = [docs1]
362
- [x.metadata.update(dict(input_type=file_extension, date=str(datetime.now))) for x in docs1]
363
 
364
 
365
  def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False, chunk=True, chunk_size=512,
@@ -409,42 +536,45 @@ def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False, c
409
  f.write(file)
410
  metadata = dict(source=source_file, date=str(datetime.now()), input_type='pasted txt')
411
  doc1 = Document(page_content=file, metadata=metadata)
412
- elif file.endswith('.html') or file.endswith('.mhtml'):
413
  docs1 = UnstructuredHTMLLoader(file_path=file).load()
414
  add_meta(docs1, file)
415
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
416
- elif (file.endswith('.docx') or file.endswith('.doc')) and have_libreoffice:
417
  docs1 = UnstructuredWordDocumentLoader(file_path=file).load()
418
  add_meta(docs1, file)
419
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
420
- elif file.endswith('.odt'):
421
  docs1 = UnstructuredODTLoader(file_path=file).load()
422
  add_meta(docs1, file)
423
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
424
- elif file.endswith('pptx') or file.endswith('ppt'):
425
  docs1 = UnstructuredPowerPointLoader(file_path=file).load()
426
  add_meta(docs1, file)
427
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
428
- elif file.endswith('.txt'):
429
  # use UnstructuredFileLoader ?
430
- doc1 = TextLoader(file, encoding="utf8", autodetect_encoding=True).load()
 
 
431
  add_meta(doc1, file)
432
- elif file.endswith('.rtf'):
433
  docs1 = UnstructuredRTFLoader(file).load()
434
  add_meta(docs1, file)
435
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
436
- elif file.endswith('.md'):
437
  docs1 = UnstructuredMarkdownLoader(file).load()
438
  add_meta(docs1, file)
439
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
440
- elif file.endswith('.enex'):
441
- doc1 = EverNoteLoader(file).load()
442
  add_meta(doc1, file)
443
- elif file.endswith('.epub'):
 
444
  docs1 = UnstructuredEPubLoader(file).load()
445
  add_meta(docs1, file)
446
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
447
- elif file.endswith('.jpeg') or file.endswith('.jpg') or file.endswith('.png'):
448
  docs1 = []
449
  if have_tesseract and enable_ocr:
450
  # OCR, somewhat works, but not great
@@ -471,13 +601,14 @@ def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False, c
471
  docs1.extend(docs1c)
472
  for doci in docs1:
473
  doci.metadata['source'] = doci.metadata['image_path']
 
474
  if docs1:
475
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
476
- elif file.endswith('.msg'):
477
  raise RuntimeError("Not supported, GPL3 license")
478
  # docs1 = OutlookMessageLoader(file).load()
479
  # docs1[0].metadata['source'] = file
480
- elif file.endswith('.eml'):
481
  try:
482
  docs1 = UnstructuredEmailLoader(file).load()
483
  add_meta(docs1, file)
@@ -491,34 +622,43 @@ def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False, c
491
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
492
  else:
493
  raise
494
- # elif file.endswith('.gcsdir'):
495
  # doc1 = GCSDirectoryLoader(project_name, bucket, prefix).load()
496
- # elif file.endswith('.gcsfile'):
497
  # doc1 = GCSFileLoader(project_name, bucket, blob).load()
498
- elif file.endswith('.rst'):
499
  with open(file, "r") as f:
500
  doc1 = Document(page_content=f.read(), metadata={"source": file})
501
  add_meta(doc1, file)
502
- elif file.endswith('.pdf'):
 
 
 
 
 
 
 
 
 
 
 
503
  # Some PDFs return nothing or junk from PDFMinerLoader
504
- # e.g. Beyond fine-tuning_ Classifying high resolution mammograms using function-preserving transformations _ Elsevier Enhanced Reader.pdf
505
- doc1 = PyPDFLoader(file).load_and_split()
506
  add_meta(doc1, file)
507
- elif file.endswith('.csv'):
508
  doc1 = CSVLoader(file).load()
509
  add_meta(doc1, file)
510
- elif file.endswith('.py'):
511
  doc1 = PythonLoader(file).load()
512
  add_meta(doc1, file)
513
- elif file.endswith('.toml'):
514
  doc1 = TomlLoader(file).load()
515
  add_meta(doc1, file)
516
- elif file.endswith('.urls'):
517
  with open(file, "r") as f:
518
  docs1 = UnstructuredURLLoader(urls=f.readlines()).load()
519
  add_meta(docs1, file)
520
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
521
- elif file.endswith('.zip'):
522
  with zipfile.ZipFile(file, 'r') as zip_ref:
523
  # don't put into temporary path, since want to keep references to docs inside zip
524
  # so just extract in path where
@@ -529,11 +669,17 @@ def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False, c
529
  raise RuntimeError("No file handler for %s" % os.path.basename(file))
530
 
531
  # allow doc1 to be list or not. If not list, did not chunk yet, so chunk now
 
532
  if not isinstance(doc1, list):
533
  if chunk:
534
  docs = chunk_sources([doc1], chunk_size=chunk_size)
535
  else:
536
  docs = [doc1]
 
 
 
 
 
537
  else:
538
  docs = doc1
539
 
@@ -590,6 +736,8 @@ def path_to_docs(path_or_paths, verbose=False, fail_any_exception=False, n_jobs=
590
  captions_model=None,
591
  caption_loader=None,
592
  enable_ocr=False,
 
 
593
  ):
594
  globs_image_types = []
595
  globs_non_image_types = []
@@ -617,6 +765,28 @@ def path_to_docs(path_or_paths, verbose=False, fail_any_exception=False, n_jobs=
617
  # But instead, allow fail so can collect unsupported too
618
  set_globs_image_types = set(globs_image_types)
619
  globs_non_image_types.extend([x for x in path_or_paths if x not in set_globs_image_types])
620
  # could use generator, but messes up metadata handling in recursive case
621
  if caption_loader and not isinstance(caption_loader, (bool, str)) and \
622
  caption_loader.device != 'cpu' or \
@@ -643,21 +813,21 @@ def path_to_docs(path_or_paths, verbose=False, fail_any_exception=False, n_jobs=
643
  if n_jobs != 1 and len(globs_non_image_types) > 1:
644
  # avoid nesting, e.g. upload 1 zip and then inside many files
645
  # harder to handle if upload many zips with many files, inner parallel one will be disabled by joblib
646
- documents = Parallel(n_jobs=n_jobs, verbose=10 if verbose else 0, backend='multiprocessing')(
647
  delayed(path_to_doc1)(file, **kwargs) for file in globs_non_image_types
648
  )
649
  else:
650
- documents = [path_to_doc1(file, **kwargs) for file in globs_non_image_types]
651
 
652
  # do images separately since can't fork after cuda in parent, so can't be parallel
653
  if n_jobs_image != 1 and len(globs_image_types) > 1:
654
  # avoid nesting, e.g. upload 1 zip and then inside many files
655
  # harder to handle if upload many zips with many files, inner parallel one will be disabled by joblib
656
- image_documents = Parallel(n_jobs=n_jobs, verbose=10 if verbose else 0, backend='multiprocessing')(
657
  delayed(path_to_doc1)(file, **kwargs) for file in globs_image_types
658
  )
659
  else:
660
- image_documents = [path_to_doc1(file, **kwargs) for file in globs_image_types]
661
 
662
  # add image docs in
663
  documents += image_documents
@@ -676,7 +846,9 @@ def path_to_docs(path_or_paths, verbose=False, fail_any_exception=False, n_jobs=
676
  return documents
677
 
678
 
679
- def prep_langchain(persist_directory, load_db_if_exists, db_type, use_openai_embedding, langchain_mode, user_path,
 
 
680
  hf_embedding_model, n_jobs=-1, kwargs_make_db={}):
681
  """
682
  do prep first time, involving downloads
@@ -685,12 +857,18 @@ def prep_langchain(persist_directory, load_db_if_exists, db_type, use_openai_emb
685
  """
686
  assert langchain_mode not in ['MyData'], "Should not prep scratch data"
687
 
688
- if os.path.isdir(persist_directory):
 
 
689
  print("Prep: persist_directory=%s exists, using" % persist_directory, flush=True)
690
  db = get_existing_db(persist_directory, load_db_if_exists, db_type, use_openai_embedding, langchain_mode,
691
  hf_embedding_model)
692
  else:
693
- print("Prep: persist_directory=%s does not exist, regenerating" % persist_directory, flush=True)
 
 
 
 
694
  db = None
695
  if langchain_mode in ['All', 'DriverlessAI docs']:
696
  # FIXME: Could also just use dai_docs.pickle directly and upload that
@@ -701,19 +879,52 @@ def prep_langchain(persist_directory, load_db_if_exists, db_type, use_openai_emb
701
 
702
  langchain_kwargs = kwargs_make_db.copy()
703
  langchain_kwargs.update(locals())
704
- db = make_db(**langchain_kwargs)
705
 
706
  return db
707
 
708
 
 
709
  def get_existing_db(persist_directory, load_db_if_exists, db_type, use_openai_embedding, langchain_mode,
710
  hf_embedding_model):
711
  if load_db_if_exists and db_type == 'chroma' and os.path.isdir(persist_directory) and os.path.isdir(
712
  os.path.join(persist_directory, 'index')):
713
  print("DO Loading db: %s" % langchain_mode, flush=True)
714
  embedding = get_embedding(use_openai_embedding, hf_embedding_model=hf_embedding_model)
 
 
 
 
715
  db = Chroma(persist_directory=persist_directory, embedding_function=embedding,
716
- collection_name=langchain_mode.replace(' ', '_'))
 
717
  print("DONE Loading db: %s" % langchain_mode, flush=True)
718
  return db
719
  return None
@@ -740,21 +951,40 @@ def _make_db(use_openai_embedding=False,
740
  langchain_mode=None,
741
  user_path=None,
742
  db_type='faiss',
743
- load_db_if_exists=False,
744
  db=None,
745
- n_jobs=-1):
 
746
  persist_directory = 'db_dir_%s' % langchain_mode # single place, no special names for each case
747
  if not db and load_db_if_exists and db_type == 'chroma' and os.path.isdir(persist_directory) and os.path.isdir(
748
  os.path.join(persist_directory, 'index')):
749
  assert langchain_mode not in ['MyData'], "Should not load MyData db this way"
750
- print("Loading db", flush=True)
751
  embedding = get_embedding(use_openai_embedding, hf_embedding_model=hf_embedding_model)
 
 
 
 
752
  db = Chroma(persist_directory=persist_directory, embedding_function=embedding,
753
- collection_name=langchain_mode.replace(' ', '_'))
754
- elif not db:
 
 
 
 
 
755
  assert langchain_mode not in ['MyData'], "Should not make MyData db this way"
756
- sources = []
757
- print("Generating sources", flush=True)
 
 
 
 
 
 
 
 
 
758
  if langchain_mode in ['wiki_full', 'All', "'All'"]:
759
  from read_wiki_full import get_all_documents
760
  small_test = None
@@ -783,9 +1013,25 @@ def _make_db(use_openai_embedding=False,
783
  sources.extend(sources1)
784
  if langchain_mode in ['All', 'UserData']:
785
  if user_path:
 
 
 
 
 
 
 
 
 
786
  # chunk internally for speed over multiple docs
787
- sources1 = path_to_docs(user_path, n_jobs=n_jobs, chunk=chunk, chunk_size=chunk_size)
 
 
 
 
 
 
788
  sources.extend(sources1)
 
789
  else:
790
  print("Chose UserData but user_path is empty/None", flush=True)
791
  if False and langchain_mode in ['urls', 'All', "'All'"]:
@@ -797,14 +1043,48 @@ def _make_db(use_openai_embedding=False,
797
  sources1 = loader.load()
798
  sources.extend(sources1)
799
  if not sources:
800
- print("langchain_mode %s has no sources, not making db" % langchain_mode, flush=True)
801
- return None
802
- print("Generating db", flush=True)
803
- db = get_db(sources, use_openai_embedding=use_openai_embedding, db_type=db_type,
804
- persist_directory=persist_directory, langchain_mode=langchain_mode,
805
- hf_embedding_model=hf_embedding_model)
806
- print("Generated db", flush=True)
807
- return db
808
 
809
 
810
  source_prefix = "Sources [Score | Link]:"
@@ -828,6 +1108,7 @@ def _run_qa_db(query=None,
828
  use_openai_model=False, use_openai_embedding=False,
829
  first_para=False, text_limit=None, k=4, chunk=False, chunk_size=1024,
830
  user_path=None,
 
831
  db_type='faiss',
832
  model_name=None, model=None, tokenizer=None,
833
  hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
@@ -847,7 +1128,9 @@ def _run_qa_db(query=None,
847
  top_p=0.7,
848
  langchain_mode=None,
849
  document_choice=['All'],
850
- n_jobs=-1):
 
 
851
  """
852
 
853
  :param query:
@@ -859,17 +1142,19 @@ def _run_qa_db(query=None,
859
  :param chunk:
860
  :param chunk_size:
861
  :param user_path: user path to glob recursively from
862
- :param db_type: 'faiss' for in-memory db or 'chroma' for persistent db
863
  :param model_name: model name, used to switch behaviors
864
  :param model: pre-initialized model, else will make new one
865
  :param tokenizer: pre-initialized tokenizer, else will make new one. Required not None if model is not None
866
  :param answer_with_sources
867
  :return:
868
  """
869
-
870
- # FIXME: For All just go over all dbs instead of a separate db for All
871
- db = make_db(**locals())
872
- prompt_type = prompter.prompt_type if prompter is not None else prompt_type
 
 
873
  llm, model_name, streamer, prompt_type_out = get_llm(use_openai_model=use_openai_model, model_name=model_name,
874
  model=model, tokenizer=tokenizer,
875
  stream_output=stream_output,
@@ -879,74 +1164,173 @@ def _run_qa_db(query=None,
879
  top_k=top_k,
880
  top_p=top_p,
881
  prompt_type=prompt_type,
 
 
882
  )
883
 
884
- if model_name in ['llama', 'gptj']:
885
  # FIXME: for now, streams to stdout/stderr currently
886
  stream_output = False
887
 
888
- if not use_openai_model and prompt_type not in ['plain'] or model_name in ['llama', 'gptj']:
889
- # instruct-like, rather than few-shot prompt_type='plain' as default
890
- # but then sources confuse the model with how inserted among rest of text, so avoid
891
- prefix = ""
892
  if langchain_mode in ['Disabled', 'ChatLLM', 'LLM']:
893
  use_context = False
894
- template = """%s{context}{question}""" % prefix
895
  else:
896
  use_context = True
897
- template = """%s
898
- ==
899
- {context}
900
- ==
901
- {question}""" % prefix
902
- prompt = PromptTemplate(
903
- # input_variables=["summaries", "question"],
904
- input_variables=["context", "question"],
905
- template=template,
906
- )
907
- chain = load_qa_chain(llm, prompt=prompt)
908
  else:
909
- chain = load_qa_with_sources_chain(llm)
910
  use_context = True
911
 
912
- if query is None:
913
- query = "What are the main differences between Linux and Windows?"
914
  # https://github.com/hwchase17/langchain/issues/1946
915
  # FIXME: Seems to way to get size of chroma db to limit k to avoid
916
  # Chroma collection MyData contains fewer than 4 elements.
917
  # type logger error
918
  k_db = 1000 if db_type == 'chroma' else k # k=100 works ok too for
919
 
920
  if db and use_context:
921
  if isinstance(document_choice, str):
922
  # support string as well
923
  document_choice = [document_choice]
924
- if not isinstance(db, Chroma) or len(document_choice) <= 1 and document_choice[0].lower() == 'all':
 
 
925
  # treat empty list as All for now, not 'None'
926
  filter_kwargs = {}
 
 
 
927
  else:
928
  if len(document_choice) >= 2:
929
  or_filter = [{"source": {"$eq": x}} for x in document_choice]
930
  filter_kwargs = dict(filter={"$or": or_filter})
931
- else:
932
  one_filter = [{"source": {"$eq": x}} for x in document_choice][0]
933
  filter_kwargs = dict(filter=one_filter)
934
- if len(document_choice) == 1 and document_choice[0].lower() == 'none':
 
 
935
  k_db = 1
936
  k = 0
937
  docs_with_score = db.similarity_search_with_score(query, k=k_db, **filter_kwargs)[:k]
938
  # cut off so no high distance docs/sources considered
939
  docs = [x[0] for x in docs_with_score if x[1] < cut_distanct]
940
  scores = [x[1] for x in docs_with_score if x[1] < cut_distanct]
941
- if len(scores) > 0:
942
  print("Distance: min: %s max: %s mean: %s median: %s" %
943
  (scores[0], scores[-1], np.mean(scores), np.median(scores)), flush=True)
944
  else:
945
  docs = []
946
  scores = []
947
 
948
- if not docs and use_context:
949
- return None
 
 
 
 
 
950
 
951
  common_words_file = "data/NGSL_1.2_stats.csv.zip"
952
  if os.path.isfile(common_words_file):
@@ -958,88 +1342,82 @@ def _run_qa_db(query=None,
958
  num_common = len([x.lower() in set_common for x in reduced_query_words])
959
  frac_common = num_common / len(reduced_query) if reduced_query else 0
960
  # FIXME: report to user bad query that uses too many common words
961
- print("frac_common: %s" % frac_common, flush=True)
 
 
 
 
 
962
 
963
- if langchain_mode in ['Disabled', 'ChatLLM', 'LLM']:
964
  chain_kwargs = dict(input_documents=[], question=query)
965
  else:
966
  chain_kwargs = dict(input_documents=docs, question=query)
967
 
968
- if stream_output:
969
- answer = None
970
- assert streamer is not None
971
- target = wrapped_partial(chain, chain_kwargs)
972
- import queue
973
- bucket = queue.Queue()
974
- thread = EThread(target=target, streamer=streamer, bucket=bucket)
975
- thread.start()
976
- outputs = ""
977
- prompt = None # FIXME
978
- try:
979
- for new_text in streamer:
980
- # print("new_text: %s" % new_text, flush=True)
981
- if bucket.qsize() > 0 or thread.exc:
982
- thread.join()
983
- outputs += new_text
984
- if prompter: # and False: # FIXME: pipeline can already use prompter
985
- output1 = prompter.get_response(outputs, prompt=prompt,
986
- sanitize_bot_response=sanitize_bot_response)
987
- yield output1
988
- else:
989
- yield outputs
990
- except BaseException:
991
- # if any exception, raise that exception if was from thread, first
992
- if thread.exc:
993
- raise thread.exc
994
- raise
995
- finally:
996
- # in case no exception and didn't join with thread yet, then join
997
- if not thread.exc:
998
- answer = thread.join()
999
- # in case raise StopIteration or broke queue loop in streamer, but still have exception
1000
- if thread.exc:
1001
- raise thread.exc
1002
- # FIXME: answer is not string outputs from streamer. How to get actual final output?
1003
- # answer = outputs
1004
- else:
1005
- answer = chain(chain_kwargs)
1006
 
1007
- if not use_context:
1008
- ret = answer['output_text']
1009
- yield ret
1010
- elif answer is not None:
1011
  print("query: %s" % query, flush=True)
1012
  print("answer: %s" % answer['output_text'], flush=True)
1013
- # link
1014
- answer_sources = [(max(0.0, 1.5 - score) / 1.5, get_url(doc)) for score, doc in
1015
- zip(scores, answer['input_documents'])]
1016
- answer_sources_dict = defaultdict(list)
1017
- [answer_sources_dict[url].append(score) for score, url in answer_sources]
1018
- answers_dict = {}
1019
- for url, scores_url in answer_sources_dict.items():
1020
- answers_dict[url] = np.max(scores_url)
1021
- answer_sources = [(score, url) for url, score in answers_dict.items()]
1022
- answer_sources.sort(key=lambda x: x[0], reverse=True)
1023
- if show_rank:
1024
- # answer_sources = ['%d | %s' % (1 + rank, url) for rank, (score, url) in enumerate(answer_sources)]
1025
- # sorted_sources_urls = "Sources [Rank | Link]:<br>" + "<br>".join(answer_sources)
1026
- answer_sources = ['%s' % url for rank, (score, url) in enumerate(answer_sources)]
1027
- sorted_sources_urls = "Ranked Sources:<br>" + "<br>".join(answer_sources)
1028
- else:
1029
- answer_sources = ['<li>%.2g | %s</li>' % (score, url) for score, url in answer_sources]
1030
- sorted_sources_urls = f"{source_prefix}<p><ul>" + "<p>".join(answer_sources)
1031
- sorted_sources_urls += f"</ul></p>{source_postfix}"
1032
 
1033
- if not answer['output_text'].endswith('\n'):
1034
- answer['output_text'] += '\n'
1035
 
1036
- if answer_with_sources:
1037
- ret = answer['output_text'] + '\n' + sorted_sources_urls
1038
- else:
1039
- ret = answer['output_text']
1040
 
1041
- yield ret
1042
- return
 
 
 
 
1043
 
1044
 
1045
  def chunk_sources(sources, chunk_size=1024):
 
3
  import os
4
  import pathlib
5
  import pickle
6
+ import queue
7
  import shutil
8
  import subprocess
9
  import sys
 
17
  from operator import concat
18
 
19
  from joblib import Parallel, delayed
20
+ from tqdm import tqdm
21
 
22
+ from prompter import non_hf_types
23
  from utils import wrapped_partial, EThread, import_matplotlib, sanitize_filename, makedirs, get_url, flatten_list, \
24
+ get_device, ProgressParallel, remove, hash_file
25
 
26
  import_matplotlib()
27
 
 
38
  EverNoteLoader, UnstructuredEmailLoader, UnstructuredODTLoader, UnstructuredPowerPointLoader, \
39
  UnstructuredEPubLoader, UnstructuredImageLoader, UnstructuredRTFLoader, ArxivLoader
40
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
41
  from langchain.chains.question_answering import load_qa_chain
42
  from langchain.docstore.document import Document
43
  from langchain import PromptTemplate
 
45
 
46
 
47
  def get_db(sources, use_openai_embedding=False, db_type='faiss', persist_directory="db_dir", langchain_mode='notset',
48
+ collection_name=None,
49
  hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
50
  if not sources:
51
  return None
52
  # get embedding model
53
  embedding = get_embedding(use_openai_embedding, hf_embedding_model=hf_embedding_model)
54
+ assert collection_name is not None or langchain_mode != 'notset'
55
+ if collection_name is None:
56
+ collection_name = langchain_mode.replace(' ', '_')
57
 
58
  # Create vector database
59
  if db_type == 'faiss':
60
+ from langchain.vectorstores import FAISS
61
  db = FAISS.from_documents(sources, embedding)
62
+
63
+ elif db_type == 'weaviate':
64
+ import weaviate
65
+ from weaviate.embedded import EmbeddedOptions
66
+ from langchain.vectorstores import Weaviate
67
+
68
+ # TODO: add support for connecting via docker compose
69
+ client = weaviate.Client(
70
+ embedded_options=EmbeddedOptions()
71
+ )
72
+ index_name = collection_name.capitalize()
73
+ db = Weaviate.from_documents(documents=sources, embedding=embedding, client=client, by_text=False,
74
+ index_name=index_name)
75
+
76
  elif db_type == 'chroma':
77
+ assert persist_directory is not None
78
  os.makedirs(persist_directory, exist_ok=True)
79
  db = Chroma.from_documents(documents=sources,
80
  embedding=embedding,
 
82
  collection_name=collection_name,
83
  anonymized_telemetry=False)
84
  db.persist()
 
 
 
 
 
85
  else:
86
  raise RuntimeError("No such db_type=%s" % db_type)
87
 
88
  return db
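# A minimal usage sketch of get_db, assuming faiss-cpu and sentence-transformers are
# installed; the document text and source name below are illustrative only.
from langchain.docstore.document import Document

example_docs = [Document(page_content="h2oGPT can index documents into FAISS, Chroma or Weaviate.",
                         metadata={"source": "example.txt"})]
example_db = get_db(example_docs, db_type='faiss', collection_name='example')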
89
 
90
 
91
+ def _get_unique_sources_in_weaviate(db):
92
+ batch_size = 100
93
+ id_source_list = []
94
+ result = db._client.data_object.get(class_name=db._index_name, limit=batch_size)
95
+
96
+ while result['objects']:
97
+ id_source_list += [(obj['id'], obj['properties']['source']) for obj in result['objects']]
98
+ last_id = id_source_list[-1][0]
99
+ result = db._client.data_object.get(class_name=db._index_name, limit=batch_size, after=last_id)
100
+
101
+ unique_sources = {source for _, source in id_source_list}
102
+ return unique_sources
103
+
104
+
105
+ def add_to_db(db, sources, db_type='faiss',
106
+ avoid_dup_by_file=False,
107
+ avoid_dup_by_content=True):
108
+ num_new_sources = len(sources)
109
  if not sources:
110
+ return db, num_new_sources, []
111
  if db_type == 'faiss':
112
  db.add_documents(sources)
113
+ elif db_type == 'weaviate':
114
+ # FIXME: only control by file name, not hash yet
115
+ if avoid_dup_by_file or avoid_dup_by_content:
116
+ unique_sources = _get_unique_sources_in_weaviate(db)
117
+ sources = [x for x in sources if x.metadata['source'] not in unique_sources]
118
+ num_new_sources = len(sources)
119
+ if num_new_sources == 0:
120
+ return db, num_new_sources, []
121
+ db.add_documents(documents=sources)
122
  elif db_type == 'chroma':
123
+ collection = db.get()
124
+ # files we already have:
125
+ metadata_files = set([x['source'] for x in collection['metadatas']])
126
+ if avoid_dup_by_file:
127
+ # Too weak in case file changed content, assume parent shouldn't pass true for this for now
128
+ raise RuntimeError("Not desired code path")
129
+ sources = [x for x in sources if x.metadata['source'] not in metadata_files]
130
+ if avoid_dup_by_content:
131
+ # look at hash, instead of page_content
132
+ # migration: If no hash previously, avoid updating,
133
+ # since don't know if need to update and may be expensive to redo all unhashed files
134
+ metadata_hash_ids = set(
135
+ [x['hashid'] for x in collection['metadatas'] if 'hashid' in x and x['hashid'] not in ["None", None]])
136
+ # avoid sources with same hash
137
+ sources = [x for x in sources if x.metadata.get('hashid') not in metadata_hash_ids]
138
+ # get new file names that match existing file names. delete existing files we are overridding
139
+ dup_metadata_files = set([x.metadata['source'] for x in sources if x.metadata['source'] in metadata_files])
140
+ print("Removing %s duplicate files from db because ingesting those as new documents" % len(
141
+ dup_metadata_files), flush=True)
142
+ client_collection = db._client.get_collection(name=db._collection.name)
143
+ for dup_file in dup_metadata_files:
144
+ dup_file_meta = dict(source=dup_file)
145
+ try:
146
+ client_collection.delete(where=dup_file_meta)
147
+ except KeyError:
148
+ pass
149
+ num_new_sources = len(sources)
150
+ if num_new_sources == 0:
151
+ return db, num_new_sources, []
152
  db.add_documents(documents=sources)
153
  db.persist()
154
  else:
155
  raise RuntimeError("No such db_type=%s" % db_type)
156
 
157
+ new_sources_metadata = [x.metadata for x in sources]
158
+
159
+ return db, num_new_sources, new_sources_metadata
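# Standalone sketch of the content-hash dedup idea used by add_to_db above; utils.hash_file
# is assumed to behave roughly like this, but this is not the project's implementation.
import hashlib

def _hash_file_sketch(path, blocksize=1 << 20):
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(blocksize), b""):
            md5.update(block)
    return md5.hexdigest()

# Chunks whose metadata['hashid'] already exists in the collection are skipped, while chunks
# whose 'source' matches an existing file but carry a new hash replace that file's old chunks.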
160
+
161
+
162
+ def create_or_update_db(db_type, persist_directory, collection_name,
163
+ sources, use_openai_embedding, add_if_exists, verbose, hf_embedding_model):
164
+ if db_type == 'weaviate':
165
+ import weaviate
166
+ from weaviate.embedded import EmbeddedOptions
167
+
168
+ # TODO: add support for connecting via docker compose
169
+ client = weaviate.Client(
170
+ embedded_options=EmbeddedOptions()
171
+ )
172
+ index_name = collection_name.replace(' ', '_').capitalize()
173
+ if client.schema.exists(index_name) and not add_if_exists:
174
+ client.schema.delete_class(index_name)
175
+ if verbose:
176
+ print("Removing %s" % index_name, flush=True)
177
+ elif db_type == 'chroma':
178
+ if not os.path.isdir(persist_directory) or not add_if_exists:
179
+ if os.path.isdir(persist_directory):
180
+ if verbose:
181
+ print("Removing %s" % persist_directory, flush=True)
182
+ remove(persist_directory)
183
+ if verbose:
184
+ print("Generating db", flush=True)
185
+
186
+ if not add_if_exists:
187
+ if verbose:
188
+ print("Generating db", flush=True)
189
+ else:
190
+ if verbose:
191
+ print("Loading and updating db", flush=True)
192
+
193
+ db = get_db(sources,
194
+ use_openai_embedding=use_openai_embedding,
195
+ db_type=db_type,
196
+ persist_directory=persist_directory,
197
+ langchain_mode=collection_name,
198
+ hf_embedding_model=hf_embedding_model)
199
+
200
  return db
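# Illustrative call of create_or_update_db; every argument value below is an example only.
from langchain.docstore.document import Document

new_sources = [Document(page_content="Example report text.", metadata={"source": "user_path/report.txt"})]
user_db = create_or_update_db(db_type='chroma',
                              persist_directory='db_dir_UserData',
                              collection_name='UserData',
                              sources=new_sources,
                              use_openai_embedding=False,
                              add_if_exists=True,
                              verbose=True,
                              hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2")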
201
 
202
 
 
234
  top_k=40,
235
  top_p=0.7,
236
  prompt_type=None,
237
+ prompter=None,
238
+ verbose=False,
239
  ):
240
  if use_openai_model:
241
  from langchain.llms import OpenAI
242
  llm = OpenAI(temperature=0)
243
  model_name = 'openai'
244
  streamer = None
245
+ prompt_type = 'plain'
246
+ elif model_name in non_hf_types:
247
  from gpt4all_llm import get_llm_gpt4all
248
  llm = get_llm_gpt4all(model_name, model=model, max_new_tokens=max_new_tokens,
249
  temperature=temperature,
250
  repetition_penalty=repetition_penalty,
251
  top_k=top_k,
252
  top_p=top_p,
253
+ verbose=verbose,
254
  )
255
  streamer = None
256
  prompt_type = 'plain'
 
261
  # only used if didn't pass model in
262
  assert model_name is None
263
  assert tokenizer is None
264
+ prompt_type = 'human_bot'
265
  model_name = 'h2oai/h2ogpt-oasst1-512-12b'
266
  # model_name = 'h2oai/h2ogpt-oig-oasst1-512-6_9b'
267
  # model_name = 'h2oai/h2ogpt-oasst1-512-20b'
 
278
  torch_dtype=torch_dtype,
279
  load_in_8bit=load_8bit)
280
 
281
+ max_max_tokens = tokenizer.model_max_length
282
+ gen_kwargs = dict(max_new_tokens=max_new_tokens,
283
+ return_full_text=True,
284
+ early_stopping=False,
285
+ handle_long_generation='hole')
286
+
287
  if stream_output:
288
  skip_prompt = False
289
  from generate import H2OTextIteratorStreamer
 
293
  else:
294
  streamer = None
295
 
296
+ from h2oai_pipeline import H2OTextGenerationPipeline
297
+ pipe = H2OTextGenerationPipeline(model=model, use_prompter=True,
298
+ prompter=prompter,
299
+ prompt_type=prompt_type,
300
+ sanitize_bot_response=True,
301
+ chat=False, stream_output=stream_output,
302
+ tokenizer=tokenizer,
303
+ max_input_tokens=max_max_tokens - max_new_tokens,
304
+ **gen_kwargs)
305
+ # pipe.task = "text-generation"
306
+ # below makes it listen only to our prompt removal,
307
+ # not built in prompt removal that is less general and not specific for our model
308
+ pipe.task = "text2text-generation"
309
 
310
  from langchain.llms import HuggingFacePipeline
311
  llm = HuggingFacePipeline(pipeline=pipe)
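# The same LangChain wrapping with the stock transformers pipeline, as a minimal sketch;
# "gpt2" is only a stand-in model, and H2OTextGenerationPipeline above adds prompt handling
# (prompter, prompt removal, long-input truncation) on top of this.
from transformers import pipeline as hf_pipeline
from langchain.llms import HuggingFacePipeline

plain_pipe = hf_pipeline("text-generation", model="gpt2", max_new_tokens=32)
plain_llm = HuggingFacePipeline(pipeline=plain_pipe)
print(plain_llm("Q: What is a vector database?\nA:"))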
 
461
  except (pkg_resources.DistributionNotFound, AssertionError):
462
  have_arxiv = False
463
 
464
+ try:
465
+ assert pkg_resources.get_distribution('pymupdf') is not None
466
+ have_pymupdf = True
467
+ except (pkg_resources.DistributionNotFound, AssertionError):
468
+ have_pymupdf = False
469
+
470
  image_types = ["png", "jpg", "jpeg"]
471
  non_image_types = ["pdf", "txt", "csv", "toml", "py", "rst", "rtf",
472
  "md", "html",
 
483
 
484
  def add_meta(docs1, file):
485
  file_extension = pathlib.Path(file).suffix
486
+ hashid = hash_file(file)
487
  if not isinstance(docs1, list):
488
  docs1 = [docs1]
489
+ [x.metadata.update(dict(input_type=file_extension, date=str(datetime.now()), hashid=hashid)) for x in docs1]
490
 
491
 
492
  def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False, chunk=True, chunk_size=512,
 
536
  f.write(file)
537
  metadata = dict(source=source_file, date=str(datetime.now()), input_type='pasted txt')
538
  doc1 = Document(page_content=file, metadata=metadata)
539
+ elif file.lower().endswith('.html') or file.lower().endswith('.mhtml'):
540
  docs1 = UnstructuredHTMLLoader(file_path=file).load()
541
  add_meta(docs1, file)
542
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
543
+ elif (file.lower().endswith('.docx') or file.lower().endswith('.doc')) and have_libreoffice:
544
  docs1 = UnstructuredWordDocumentLoader(file_path=file).load()
545
  add_meta(docs1, file)
546
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
547
+ elif file.lower().endswith('.odt'):
548
  docs1 = UnstructuredODTLoader(file_path=file).load()
549
  add_meta(docs1, file)
550
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
551
+ elif file.lower().endswith('pptx') or file.lower().endswith('ppt'):
552
  docs1 = UnstructuredPowerPointLoader(file_path=file).load()
553
  add_meta(docs1, file)
554
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
555
+ elif file.lower().endswith('.txt'):
556
  # use UnstructuredFileLoader ?
557
+ docs1 = TextLoader(file, encoding="utf8", autodetect_encoding=True).load()
558
+ # makes just one, but big one
559
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
560
  add_meta(doc1, file)
561
+ elif file.lower().endswith('.rtf'):
562
  docs1 = UnstructuredRTFLoader(file).load()
563
  add_meta(docs1, file)
564
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
565
+ elif file.lower().endswith('.md'):
566
  docs1 = UnstructuredMarkdownLoader(file).load()
567
  add_meta(docs1, file)
568
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
569
+ elif file.lower().endswith('.enex'):
570
+ docs1 = EverNoteLoader(file).load()
571
  add_meta(doc1, file)
572
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
573
+ elif file.lower().endswith('.epub'):
574
  docs1 = UnstructuredEPubLoader(file).load()
575
  add_meta(docs1, file)
576
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
577
+ elif file.lower().endswith('.jpeg') or file.lower().endswith('.jpg') or file.lower().endswith('.png'):
578
  docs1 = []
579
  if have_tesseract and enable_ocr:
580
  # OCR, somewhat works, but not great
 
601
  docs1.extend(docs1c)
602
  for doci in docs1:
603
  doci.metadata['source'] = doci.metadata['image_path']
604
+ doci.metadata['hash'] = hash_file(doci.metadata['source'])
605
  if docs1:
606
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
607
+ elif file.lower().endswith('.msg'):
608
  raise RuntimeError("Not supported, GPL3 license")
609
  # docs1 = OutlookMessageLoader(file).load()
610
  # docs1[0].metadata['source'] = file
611
+ elif file.lower().endswith('.eml'):
612
  try:
613
  docs1 = UnstructuredEmailLoader(file).load()
614
  add_meta(docs1, file)
 
622
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
623
  else:
624
  raise
625
+ # elif file.lower().endswith('.gcsdir'):
626
  # doc1 = GCSDirectoryLoader(project_name, bucket, prefix).load()
627
+ # elif file.lower().endswith('.gcsfile'):
628
  # doc1 = GCSFileLoader(project_name, bucket, blob).load()
629
+ elif file.lower().endswith('.rst'):
630
  with open(file, "r") as f:
631
  doc1 = Document(page_content=f.read(), metadata={"source": file})
632
  add_meta(doc1, file)
633
+ elif file.lower().endswith('.pdf'):
634
+ env_gpt4all_file = ".env_gpt4all"
635
+ from dotenv import dotenv_values
636
+ env_kwargs = dotenv_values(env_gpt4all_file)
637
+ pdf_class_name = env_kwargs.get('PDF_CLASS_NAME', 'PyMuPDFParser')
638
+ if have_pymupdf and pdf_class_name == 'PyMuPDFParser':
639
+ # GPL, only use if installed
640
+ from langchain.document_loaders import PyMuPDFLoader
641
+ doc1 = PyMuPDFLoader(file).load_and_split()
642
+ else:
643
+ # open-source fallback
644
+ doc1 = PyPDFLoader(file).load_and_split()
645
  # Some PDFs return nothing or junk from PDFMinerLoader
 
 
646
  add_meta(doc1, file)
647
+ elif file.lower().endswith('.csv'):
648
  doc1 = CSVLoader(file).load()
649
  add_meta(doc1, file)
650
+ elif file.lower().endswith('.py'):
651
  doc1 = PythonLoader(file).load()
652
  add_meta(doc1, file)
653
+ elif file.lower().endswith('.toml'):
654
  doc1 = TomlLoader(file).load()
655
  add_meta(doc1, file)
656
+ elif file.lower().endswith('.urls'):
657
  with open(file, "r") as f:
658
  docs1 = UnstructuredURLLoader(urls=f.readlines()).load()
659
  add_meta(docs1, file)
660
  doc1 = chunk_sources(docs1, chunk_size=chunk_size)
661
+ elif file.lower().endswith('.zip'):
662
  with zipfile.ZipFile(file, 'r') as zip_ref:
663
  # don't put into temporary path, since want to keep references to docs inside zip
664
  # so just extract in path where
 
669
  raise RuntimeError("No file handler for %s" % os.path.basename(file))
670
 
671
  # allow doc1 to be list or not. If not list, did not chunk yet, so chunk now
672
+ # if list of length one, don't trust and chunk it
673
  if not isinstance(doc1, list):
674
  if chunk:
675
  docs = chunk_sources([doc1], chunk_size=chunk_size)
676
  else:
677
  docs = [doc1]
678
+ elif isinstance(doc1, list) and len(doc1) == 1:
679
+ if chunk:
680
+ docs = chunk_sources(doc1, chunk_size=chunk_size)
681
+ else:
682
+ docs = doc1
683
  else:
684
  docs = doc1
685
 
 
736
  captions_model=None,
737
  caption_loader=None,
738
  enable_ocr=False,
739
+ existing_files=[],
740
+ existing_hash_ids={},
741
  ):
742
  globs_image_types = []
743
  globs_non_image_types = []
 
765
  # But instead, allow fail so can collect unsupported too
766
  set_globs_image_types = set(globs_image_types)
767
  globs_non_image_types.extend([x for x in path_or_paths if x not in set_globs_image_types])
768
+
769
+ # filter out any files to skip (e.g. if already processed them)
770
+ # this is easy, but too aggressive in case a file changed, so parent probably passed existing_files=[]
771
+ assert not existing_files, "DEV: assume not using this approach"
772
+ if existing_files:
773
+ set_skip_files = set(existing_files)
774
+ globs_image_types = [x for x in globs_image_types if x not in set_skip_files]
775
+ globs_non_image_types = [x for x in globs_non_image_types if x not in set_skip_files]
776
+ if existing_hash_ids:
777
+ # assume consistent with add_meta() use of hash_file(file)
778
+ # also assume consistent with get_existing_hash_ids for dict creation
779
+ # assume hashable values
780
+ existing_hash_ids_set = set(existing_hash_ids.items())
781
+ hash_ids_all_image = set({x: hash_file(x) for x in globs_image_types}.items())
782
+ hash_ids_all_non_image = set({x: hash_file(x) for x in globs_non_image_types}.items())
783
+ # don't use symmetric diff. If file is gone, ignore and don't remove or something
784
+ # just consider existing files (key) having new hash or not (value)
785
+ new_files_image = set(dict(hash_ids_all_image - existing_hash_ids_set).keys())
786
+ new_files_non_image = set(dict(hash_ids_all_non_image - existing_hash_ids_set).keys())
787
+ globs_image_types = [x for x in globs_image_types if x in new_files_image]
788
+ globs_non_image_types = [x for x in globs_non_image_types if x in new_files_non_image]
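# Tiny worked example of the dict-items set difference used just above: only files whose
# (path, hash) pair is not already in the db survive the filter.
existing_hashes = {"a.txt": "h1", "b.txt": "h2"}                # hashes already in the db
current_hashes = {"a.txt": "h1", "b.txt": "h3", "c.txt": "h4"}  # hashes on disk now
changed_or_new = set(dict(set(current_hashes.items()) - set(existing_hashes.items())).keys())
assert changed_or_new == {"b.txt", "c.txt"}  # changed b.txt and new c.txt; unchanged a.txt is skipped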
789
+
790
  # could use generator, but messes up metadata handling in recursive case
791
  if caption_loader and not isinstance(caption_loader, (bool, str)) and \
792
  caption_loader.device != 'cpu' or \
 
813
  if n_jobs != 1 and len(globs_non_image_types) > 1:
814
  # avoid nesting, e.g. upload 1 zip and then inside many files
815
  # harder to handle if upload many zips with many files, inner parallel one will be disabled by joblib
816
+ documents = ProgressParallel(n_jobs=n_jobs, verbose=10 if verbose else 0, backend='multiprocessing')(
817
  delayed(path_to_doc1)(file, **kwargs) for file in globs_non_image_types
818
  )
819
  else:
820
+ documents = [path_to_doc1(file, **kwargs) for file in tqdm(globs_non_image_types)]
821
 
822
  # do images separately since can't fork after cuda in parent, so can't be parallel
823
  if n_jobs_image != 1 and len(globs_image_types) > 1:
824
  # avoid nesting, e.g. upload 1 zip and then inside many files
825
  # harder to handle if upload many zips with many files, inner parallel one will be disabled by joblib
826
+ image_documents = ProgressParallel(n_jobs=n_jobs, verbose=10 if verbose else 0, backend='multiprocessing')(
827
  delayed(path_to_doc1)(file, **kwargs) for file in globs_image_types
828
  )
829
  else:
830
+ image_documents = [path_to_doc1(file, **kwargs) for file in tqdm(globs_image_types)]
831
 
832
  # add image docs in
833
  documents += image_documents
 
846
  return documents
847
 
848
 
849
+ def prep_langchain(persist_directory,
850
+ load_db_if_exists,
851
+ db_type, use_openai_embedding, langchain_mode, user_path,
852
  hf_embedding_model, n_jobs=-1, kwargs_make_db={}):
853
  """
854
  do prep first time, involving downloads
 
857
  """
858
  assert langchain_mode not in ['MyData'], "Should not prep scratch data"
859
 
860
+ db_dir_exists = os.path.isdir(persist_directory)
861
+
862
+ if db_dir_exists and user_path is None:
863
  print("Prep: persist_directory=%s exists, using" % persist_directory, flush=True)
864
  db = get_existing_db(persist_directory, load_db_if_exists, db_type, use_openai_embedding, langchain_mode,
865
  hf_embedding_model)
866
  else:
867
+ if db_dir_exists and user_path is not None:
868
+ print("Prep: persist_directory=%s exists, user_path=%s passed, adding any changed or new documents" % (
869
+ persist_directory, user_path), flush=True)
870
+ elif not db_dir_exists:
871
+ print("Prep: persist_directory=%s does not exist, regenerating" % persist_directory, flush=True)
872
  db = None
873
  if langchain_mode in ['All', 'DriverlessAI docs']:
874
  # FIXME: Could also just use dai_docs.pickle directly and upload that
 
879
 
880
  langchain_kwargs = kwargs_make_db.copy()
881
  langchain_kwargs.update(locals())
882
+ db, num_new_sources, new_sources_metadata = make_db(**langchain_kwargs)
883
 
884
  return db
885
 
886
 
887
+ import posthog
888
+
889
+ posthog.disabled = True
890
+
891
+
892
+ class FakeConsumer(object):
893
+ def __init__(self, *args, **kwargs):
894
+ pass
895
+
896
+ def run(self):
897
+ pass
898
+
899
+ def pause(self):
900
+ pass
901
+
902
+ def upload(self):
903
+ pass
904
+
905
+ def next(self):
906
+ pass
907
+
908
+ def request(self, batch):
909
+ pass
910
+
911
+
912
+ posthog.Consumer = FakeConsumer
913
+
914
+
915
  def get_existing_db(persist_directory, load_db_if_exists, db_type, use_openai_embedding, langchain_mode,
916
  hf_embedding_model):
917
  if load_db_if_exists and db_type == 'chroma' and os.path.isdir(persist_directory) and os.path.isdir(
918
  os.path.join(persist_directory, 'index')):
919
  print("DO Loading db: %s" % langchain_mode, flush=True)
920
  embedding = get_embedding(use_openai_embedding, hf_embedding_model=hf_embedding_model)
921
+ from chromadb.config import Settings
922
+ client_settings = Settings(anonymized_telemetry=False,
923
+ chroma_db_impl="duckdb+parquet",
924
+ persist_directory=persist_directory)
925
  db = Chroma(persist_directory=persist_directory, embedding_function=embedding,
926
+ collection_name=langchain_mode.replace(' ', '_'),
927
+ client_settings=client_settings)
928
  print("DONE Loading db: %s" % langchain_mode, flush=True)
929
  return db
930
  return None
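# Standalone sketch of re-opening a persisted collection the same way, with telemetry off;
# directory and collection names are illustrative, and sentence-transformers is assumed installed.
from chromadb.config import Settings
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
settings = Settings(anonymized_telemetry=False, chroma_db_impl="duckdb+parquet",
                    persist_directory="db_dir_UserData")
reopened_db = Chroma(persist_directory="db_dir_UserData", embedding_function=emb,
                     collection_name="UserData", client_settings=settings)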
 
951
  langchain_mode=None,
952
  user_path=None,
953
  db_type='faiss',
954
+ load_db_if_exists=True,
955
  db=None,
956
+ n_jobs=-1,
957
+ verbose=False):
958
  persist_directory = 'db_dir_%s' % langchain_mode # single place, no special names for each case
959
  if not db and load_db_if_exists and db_type == 'chroma' and os.path.isdir(persist_directory) and os.path.isdir(
960
  os.path.join(persist_directory, 'index')):
961
  assert langchain_mode not in ['MyData'], "Should not load MyData db this way"
962
+ print("Loading existing db", flush=True)
963
  embedding = get_embedding(use_openai_embedding, hf_embedding_model=hf_embedding_model)
964
+ from chromadb.config import Settings
965
+ client_settings = Settings(anonymized_telemetry=False,
966
+ chroma_db_impl="duckdb+parquet",
967
+ persist_directory=persist_directory)
968
  db = Chroma(persist_directory=persist_directory, embedding_function=embedding,
969
+ collection_name=langchain_mode.replace(' ', '_'),
970
+ client_settings=client_settings)
971
+ sources = []
972
+ if not db and langchain_mode not in ['MyData'] or \
973
+ user_path is not None and \
974
+ langchain_mode in ['UserData']:
975
+ # Should not make MyData db this way, why avoided, only upload from UI
976
  assert langchain_mode not in ['MyData'], "Should not make MyData db this way"
977
+ if verbose:
978
+ if langchain_mode in ['UserData']:
979
+ if user_path is not None:
980
+ print("Checking if changed or new sources in %s, and generating sources them" % user_path,
981
+ flush=True)
982
+ elif db is None:
983
+ print("user_path not passed and no db, no sources", flush=True)
984
+ else:
985
+ print("user_path not passed, using only existing db, no new sources", flush=True)
986
+ else:
987
+ print("Generating %s sources" % langchain_mode, flush=True)
988
  if langchain_mode in ['wiki_full', 'All', "'All'"]:
989
  from read_wiki_full import get_all_documents
990
  small_test = None
 
1013
  sources.extend(sources1)
1014
  if langchain_mode in ['All', 'UserData']:
1015
  if user_path:
1016
+ if db is not None:
1017
+ # NOTE: Ignore file names for now, only go by hash ids
1018
+ # existing_files = get_existing_files(db)
1019
+ existing_files = []
1020
+ existing_hash_ids = get_existing_hash_ids(db)
1021
+ else:
1022
+ # pretend no existing files so won't filter
1023
+ existing_files = []
1024
+ existing_hash_ids = []
1025
  # chunk internally for speed over multiple docs
1026
+ sources1 = path_to_docs(user_path, n_jobs=n_jobs, chunk=chunk, chunk_size=chunk_size,
1027
+ existing_files=existing_files, existing_hash_ids=existing_hash_ids)
1028
+ new_metadata_sources = set([x.metadata['source'] for x in sources1])
1029
+ if new_metadata_sources:
1030
+ print("Loaded %s new files as sources to add to UserData" % len(new_metadata_sources), flush=True)
1031
+ if verbose:
1032
+ print("Files added: %s" % '\n'.join(new_metadata_sources), flush=True)
1033
  sources.extend(sources1)
1034
+ print("Loaded %s sources for potentially adding to UserData" % len(sources), flush=True)
1035
  else:
1036
  print("Chose UserData but user_path is empty/None", flush=True)
1037
  if False and langchain_mode in ['urls', 'All', "'All'"]:
 
1043
  sources1 = loader.load()
1044
  sources.extend(sources1)
1045
  if not sources:
1046
+ if verbose:
1047
+ if db is not None:
1048
+ print("langchain_mode %s has no new sources, nothing to add to db" % langchain_mode, flush=True)
1049
+ else:
1050
+ print("langchain_mode %s has no sources, not making new db" % langchain_mode, flush=True)
1051
+ return db, 0, []
1052
+ if verbose:
1053
+ if db is not None:
1054
+ print("Generating db", flush=True)
1055
+ else:
1056
+ print("Adding to db", flush=True)
1057
+ if not db:
1058
+ if sources:
1059
+ db = get_db(sources, use_openai_embedding=use_openai_embedding, db_type=db_type,
1060
+ persist_directory=persist_directory, langchain_mode=langchain_mode,
1061
+ hf_embedding_model=hf_embedding_model)
1062
+ if verbose:
1063
+ print("Generated db", flush=True)
1064
+ else:
1065
+ print("Did not generate db since no sources", flush=True)
1066
+ new_sources_metadata = [x.metadata for x in sources]
1067
+ elif user_path is not None and langchain_mode in ['UserData']:
1068
+ print("Existing db, potentially adding %s sources from user_path=%s" % (len(sources), user_path), flush=True)
1069
+ db, num_new_sources, new_sources_metadata = add_to_db(db, sources, db_type=db_type)
1070
+ print("Existing db, added %s new sources from user_path=%s" % (num_new_sources, user_path), flush=True)
1071
+ else:
1072
+ new_sources_metadata = [x.metadata for x in sources]
1073
+
1074
+ return db, len(new_sources_metadata), new_sources_metadata
1075
+
1076
+
1077
+ def get_existing_files(db):
1078
+ collection = db.get()
1079
+ metadata_sources = set([x['source'] for x in collection['metadatas']])
1080
+ return metadata_sources
1081
+
1082
+
1083
+ def get_existing_hash_ids(db):
1084
+ collection = db.get()
1085
+ # assume consistency, that any prior hashed source was single hashed file at the time among all source chunks
1086
+ metadata_hash_ids = {x['source']: x.get('hashid') for x in collection['metadatas']}
1087
+ return metadata_hash_ids
1088
 
1089
 
1090
  source_prefix = "Sources [Score | Link]:"
 
1108
  use_openai_model=False, use_openai_embedding=False,
1109
  first_para=False, text_limit=None, k=4, chunk=False, chunk_size=1024,
1110
  user_path=None,
1111
+ detect_user_path_changes_every_query=False,
1112
  db_type='faiss',
1113
  model_name=None, model=None, tokenizer=None,
1114
  hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
 
1128
  top_p=0.7,
1129
  langchain_mode=None,
1130
  document_choice=['All'],
1131
+ n_jobs=-1,
1132
+ verbose=False,
1133
+ cli=False):
1134
  """
1135
 
1136
  :param query:
 
1142
  :param chunk:
1143
  :param chunk_size:
1144
  :param user_path: user path to glob recursively from
1145
+ :param db_type: 'faiss' for in-memory db or 'chroma' or 'weaviate' for persistent db
1146
  :param model_name: model name, used to switch behaviors
1147
  :param model: pre-initialized model, else will make new one
1148
  :param tokenizer: pre-initialized tokenizer, else will make new one. Required not None if model is not None
1149
  :param answer_with_sources
1150
  :return:
1151
  """
1152
+ assert query is not None
1153
+ assert prompter is not None or prompt_type is not None or model is None # if model is None, then will generate
1154
+ if prompter is not None:
1155
+ prompt_type = prompter.prompt_type
1156
+ if model is not None:
1157
+ assert prompt_type is not None
1158
  llm, model_name, streamer, prompt_type_out = get_llm(use_openai_model=use_openai_model, model_name=model_name,
1159
  model=model, tokenizer=tokenizer,
1160
  stream_output=stream_output,
 
1164
  top_k=top_k,
1165
  top_p=top_p,
1166
  prompt_type=prompt_type,
1167
+ prompter=prompter,
1168
+ verbose=verbose,
1169
  )
1170
 
1171
+ if model_name in non_hf_types:
1172
  # FIXME: for now, streams to stdout/stderr currently
1173
  stream_output = False
1174
 
1175
+ use_context = False
1176
+ scores = []
1177
+ chain = None
1178
+
1179
+ func_names = list(inspect.signature(get_similarity_chain).parameters)
1180
+ sim_kwargs = {k: v for k, v in locals().items() if k in func_names}
1181
+ missing_kwargs = [x for x in func_names if x not in sim_kwargs]
1182
+ assert not missing_kwargs, "Missing: %s" % missing_kwargs
1183
+ docs, chain, scores, use_context = get_similarity_chain(**sim_kwargs)
1184
+ if len(document_choice) > 0 and document_choice[0] == 'Only':
1185
+ formatted_doc_chunks = '\n\n'.join([get_url(x) + '\n\n' + x.page_content for x in docs])
1186
+ yield formatted_doc_chunks, ''
1187
+ return
1188
+ if chain is None and model_name not in non_hf_types:
1189
+ # can only return if HF type
1190
+ return
1191
+
1192
+ if stream_output:
1193
+ answer = None
1194
+ assert streamer is not None
1195
+ import queue
1196
+ bucket = queue.Queue()
1197
+ thread = EThread(target=chain, streamer=streamer, bucket=bucket)
1198
+ thread.start()
1199
+ outputs = ""
1200
+ prompt = None # FIXME
1201
+ try:
1202
+ for new_text in streamer:
1203
+ # print("new_text: %s" % new_text, flush=True)
1204
+ if bucket.qsize() > 0 or thread.exc:
1205
+ thread.join()
1206
+ outputs += new_text
1207
+ if prompter: # and False: # FIXME: pipeline can already use prompter
1208
+ output1 = prompter.get_response(outputs, prompt=prompt,
1209
+ sanitize_bot_response=sanitize_bot_response)
1210
+ yield output1, ''
1211
+ else:
1212
+ yield outputs, ''
1213
+ except BaseException:
1214
+ # if any exception, raise that exception if was from thread, first
1215
+ if thread.exc:
1216
+ raise thread.exc
1217
+ raise
1218
+ finally:
1219
+ # in case no exception and didn't join with thread yet, then join
1220
+ if not thread.exc:
1221
+ answer = thread.join()
1222
+ # in case raise StopIteration or broke queue loop in streamer, but still have exception
1223
+ if thread.exc:
1224
+ raise thread.exc
1225
+ # FIXME: answer is not string outputs from streamer. How to get actual final output?
1226
+ # answer = outputs
1227
+ else:
1228
+ answer = chain()
1229
+
1230
+ if not use_context:
1231
+ ret = answer['output_text']
1232
+ extra = ''
1233
+ yield ret, extra
1234
+ elif answer is not None:
1235
+ ret, extra = get_sources_answer(query, answer, scores, show_rank, answer_with_sources, verbose=verbose)
1236
+ yield ret, extra
1237
+ return
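# The streaming branch above follows the usual transformers streamer-plus-thread pattern;
# a minimal sketch with the stock TextIteratorStreamer ("gpt2" is only a stand-in model).
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
mdl = AutoModelForCausalLM.from_pretrained("gpt2")
enc = tok("The main difference between FAISS and Chroma is", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=False)
worker = Thread(target=mdl.generate, kwargs=dict(**enc, streamer=streamer, max_new_tokens=32))
worker.start()
partial = ""
for new_text in streamer:
    partial += new_text  # yield the growing text to the caller here, as _run_qa_db does
worker.join()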
1238
+
1239
+
1240
+ def get_similarity_chain(query=None,
1241
+ use_openai_model=False, use_openai_embedding=False,
1242
+ first_para=False, text_limit=None, k=4, chunk=False, chunk_size=1024,
1243
+ user_path=None,
1244
+ detect_user_path_changes_every_query=False,
1245
+ db_type='faiss',
1246
+ model_name=None,
1247
+ hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
1248
+ prompt_type=None,
1249
+ cut_distanct=1.1,
1250
+ load_db_if_exists=False,
1251
+ db=None,
1252
+ langchain_mode=None,
1253
+ document_choice=['All'],
1254
+ n_jobs=-1,
1255
+ # beyond run_db_query:
1256
+ llm=None,
1257
+ verbose=False,
1258
+ ):
1259
+ # determine whether use of context out of docs is planned
1260
+ if not use_openai_model and prompt_type not in ['plain'] or model_name in non_hf_types:
1261
  if langchain_mode in ['Disabled', 'ChatLLM', 'LLM']:
1262
  use_context = False
 
1263
  else:
1264
  use_context = True
 
 
 
 
 
 
 
 
 
 
 
1265
  else:
 
1266
  use_context = True
1267
 
 
 
1268
  # https://github.com/hwchase17/langchain/issues/1946
1269
  # FIXME: Seems no way to get size of chroma db to limit k to avoid
1270
  # Chroma collection MyData contains fewer than 4 elements.
1271
  # type logger error
1272
  k_db = 1000 if db_type == 'chroma' else k # k=100 works ok too for
1273
 
1274
+ # FIXME: For All just go over all dbs instead of a separate db for All
1275
+ if not detect_user_path_changes_every_query and db is not None:
1276
+ # avoid looking at user_path during similarity search db handling,
1277
+ # if already have db and not updating from user_path every query
1278
+ # but if db is None, no db yet loaded (e.g. from prep), so allow user_path to be whatever it was
1279
+ user_path = None
1280
+ db, num_new_sources, new_sources_metadata = make_db(use_openai_embedding=use_openai_embedding,
1281
+ hf_embedding_model=hf_embedding_model,
1282
+ first_para=first_para, text_limit=text_limit, chunk=chunk,
1283
+ chunk_size=chunk_size,
1284
+ langchain_mode=langchain_mode,
1285
+ user_path=user_path,
1286
+ db_type=db_type,
1287
+ load_db_if_exists=load_db_if_exists,
1288
+ db=db,
1289
+ n_jobs=n_jobs,
1290
+ verbose=verbose)
1291
+
1292
  if db and use_context:
1293
  if isinstance(document_choice, str):
1294
  # support string as well
1295
  document_choice = [document_choice]
1296
+ if not isinstance(db, Chroma) or \
1297
+ len(document_choice) == 0 or \
1298
+ len(document_choice) <= 1 and document_choice[0] == 'All':
1299
  # treat empty list as All for now, not 'None'
1300
  filter_kwargs = {}
1301
+ elif len(document_choice) > 0 and document_choice[0] == 'Only':
1302
+ # Only means All docs, but only will return sources, not LLM response
1303
+ filter_kwargs = {}
1304
  else:
1305
  if len(document_choice) >= 2:
1306
  or_filter = [{"source": {"$eq": x}} for x in document_choice]
1307
  filter_kwargs = dict(filter={"$or": or_filter})
1308
+ elif len(document_choice) > 0:
1309
  one_filter = [{"source": {"$eq": x}} for x in document_choice][0]
1310
  filter_kwargs = dict(filter=one_filter)
1311
+ else:
1312
+ filter_kwargs = {}
1313
+ if len(document_choice) == 1 and document_choice[0] == 'None':
1314
  k_db = 1
1315
  k = 0
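# Example of the Chroma metadata filter built above when two documents are selected;
# the file paths are illustrative.
selected = ["user_path/report.pdf", "user_path/notes.md"]
example_filter_kwargs = dict(filter={"$or": [{"source": {"$eq": x}} for x in selected]})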
1316
  docs_with_score = db.similarity_search_with_score(query, k=k_db, **filter_kwargs)[:k]
1317
  # cut off so no high distance docs/sources considered
1318
  docs = [x[0] for x in docs_with_score if x[1] < cut_distanct]
1319
  scores = [x[1] for x in docs_with_score if x[1] < cut_distanct]
1320
+ if len(scores) > 0 and verbose:
1321
  print("Distance: min: %s max: %s mean: %s median: %s" %
1322
  (scores[0], scores[-1], np.mean(scores), np.median(scores)), flush=True)
1323
  else:
1324
  docs = []
1325
  scores = []
1326
 
1327
+ if not docs and use_context and model_name not in non_hf_types:
1328
+ # if HF type and have no docs, can bail out
1329
+ return docs, None, [], False
1330
+
1331
+ if len(document_choice) > 0 and document_choice[0] == 'Only':
1332
+ # no LLM use
1333
+ return docs, None, [], False
1334
 
1335
  common_words_file = "data/NGSL_1.2_stats.csv.zip"
1336
  if os.path.isfile(common_words_file):
 
1342
  num_common = len([x.lower() in set_common for x in reduced_query_words])
1343
  frac_common = num_common / len(reduced_query) if reduced_query else 0
1344
  # FIXME: report to user bad query that uses too many common words
1345
+ if verbose:
1346
+ print("frac_common: %s" % frac_common, flush=True)
1347
+
1348
+ if len(docs) == 0:
1349
+ # avoid context == in prompt then
1350
+ use_context = False
1351
 
1352
+ if not use_openai_model and prompt_type not in ['plain'] or model_name in non_hf_types:
1353
+ # instruct-like, rather than few-shot prompt_type='plain' as default
1354
+ # but then sources confuse the model with how inserted among rest of text, so avoid
1355
+ prefix = ""
1356
+ if langchain_mode in ['Disabled', 'ChatLLM', 'LLM'] or not use_context:
1357
+ template = """%s{context}{question}""" % prefix
1358
+ else:
1359
+ template = """%s
1360
+ ==
1361
+ {context}
1362
+ ==
1363
+ {question}""" % prefix
1364
+ prompt = PromptTemplate(
1365
+ # input_variables=["summaries", "question"],
1366
+ input_variables=["context", "question"],
1367
+ template=template,
1368
+ )
1369
+ chain = load_qa_chain(llm, prompt=prompt)
1370
+ else:
1371
+ chain = load_qa_with_sources_chain(llm)
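A minimal sketch of what the instruct-style template above renders to, with placeholder context and question (illustrative values; the import path may differ across langchain versions):

    from langchain import PromptTemplate  # same class as used above

    template = """
    ==
    {context}
    ==
    {question}"""
    prompt = PromptTemplate(input_variables=["context", "question"], template=template)
    # the retrieved chunks fill {context}, the user query fills {question}
    print(prompt.format(context="Chunk 1 text...\nChunk 2 text...",
                        question="What does the report conclude?"))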
1372
+
1373
+ if not use_context:
1374
  chain_kwargs = dict(input_documents=[], question=query)
1375
  else:
1376
  chain_kwargs = dict(input_documents=docs, question=query)
1377
 
1378
+ target = wrapped_partial(chain, chain_kwargs)
1379
+ return docs, target, scores, use_context
 
 
1380
 
1381
+
1382
+ def get_sources_answer(query, answer, scores, show_rank, answer_with_sources, verbose=False):
1383
+ if verbose:
 
1384
  print("query: %s" % query, flush=True)
1385
  print("answer: %s" % answer['output_text'], flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1386
 
1387
+ if len(answer['input_documents']) == 0:
1388
+ extra = ''
1389
+ ret = answer['output_text'] + extra
1390
+ return ret, extra
1391
+
1392
+ # link
1393
+ answer_sources = [(max(0.0, 1.5 - score) / 1.5, get_url(doc)) for score, doc in
1394
+ zip(scores, answer['input_documents'])]
1395
+ answer_sources_dict = defaultdict(list)
1396
+ [answer_sources_dict[url].append(score) for score, url in answer_sources]
1397
+ answers_dict = {}
1398
+ for url, scores_url in answer_sources_dict.items():
1399
+ answers_dict[url] = np.max(scores_url)
1400
+ answer_sources = [(score, url) for url, score in answers_dict.items()]
1401
+ answer_sources.sort(key=lambda x: x[0], reverse=True)
1402
+ if show_rank:
1403
+ # answer_sources = ['%d | %s' % (1 + rank, url) for rank, (score, url) in enumerate(answer_sources)]
1404
+ # sorted_sources_urls = "Sources [Rank | Link]:<br>" + "<br>".join(answer_sources)
1405
+ answer_sources = ['%s' % url for rank, (score, url) in enumerate(answer_sources)]
1406
+ sorted_sources_urls = "Ranked Sources:<br>" + "<br>".join(answer_sources)
1407
+ else:
1408
+ answer_sources = ['<li>%.2g | %s</li>' % (score, url) for score, url in answer_sources]
1409
+ sorted_sources_urls = f"{source_prefix}<p><ul>" + "<p>".join(answer_sources)
1410
+ sorted_sources_urls += f"</ul></p>{source_postfix}"
1411
 
1412
+ if not answer['output_text'].endswith('\n'):
1413
+ answer['output_text'] += '\n'
 
 
1414
 
1415
+ if answer_with_sources:
1416
+ extra = '\n' + sorted_sources_urls
1417
+ else:
1418
+ extra = ''
1419
+ ret = answer['output_text'] + extra
1420
+ return ret, extra
1421
 
1422
 
1423
  def chunk_sources(sources, chunk_size=1024):
gradio_runner.py CHANGED
@@ -9,17 +9,33 @@ import traceback
9
  import uuid
10
  import filelock
11
  import pandas as pd
 
12
  import tabulate
13
 
 
14
  from gradio_themes import H2oTheme, SoftTheme, get_h2o_title, get_simple_title, get_dark_js
15
  from prompter import Prompter, \
16
- prompt_type_to_model_name, prompt_types_strings, inv_prompt_type_to_model_lower, generate_prompt
17
  from utils import get_githash, flatten_list, zip_data, s3up, clear_torch_cache, get_torch_allocated, system_info_print, \
18
- ping, get_short_name, get_url, makedirs
19
  from generate import get_model, languages_covered, evaluate, eval_func_param_names, score_qa, langchain_modes, \
20
  inputs_kwargs_list, get_cutoffs, scratch_base_dir
21
 
22
- import gradio as gr
23
  from apscheduler.schedulers.background import BackgroundScheduler
24
 
25
 
@@ -27,12 +43,11 @@ def go_gradio(**kwargs):
27
  allow_api = kwargs['allow_api']
28
  is_public = kwargs['is_public']
29
  is_hf = kwargs['is_hf']
30
- is_low_mem = kwargs['is_low_mem']
31
  n_gpus = kwargs['n_gpus']
32
  admin_pass = kwargs['admin_pass']
33
  model_state0 = kwargs['model_state0']
34
  score_model_state0 = kwargs['score_model_state0']
35
- queue = True
36
  dbs = kwargs['dbs']
37
  db_type = kwargs['db_type']
38
  visible_langchain_modes = kwargs['visible_langchain_modes']
@@ -41,7 +56,6 @@ def go_gradio(**kwargs):
41
  enable_sources_list = kwargs['enable_sources_list']
42
  enable_url_upload = kwargs['enable_url_upload']
43
  enable_text_upload = kwargs['enable_text_upload']
44
- allow_upload = allow_upload_to_user_data or allow_upload_to_my_data
45
  use_openai_embedding = kwargs['use_openai_embedding']
46
  hf_embedding_model = kwargs['hf_embedding_model']
47
  enable_captions = kwargs['enable_captions']
@@ -50,6 +64,8 @@ def go_gradio(**kwargs):
50
  caption_loader = kwargs['caption_loader']
51
 
52
  # easy update of kwargs needed for evaluate() etc.
 
 
53
  kwargs.update(locals())
54
 
55
  if 'mbart-' in kwargs['model_lower']:
@@ -76,8 +92,8 @@ def go_gradio(**kwargs):
76
  """
77
  else:
78
  description = more_info
79
- description += "If this host is busy, try [12B](https://gpt.h2o.ai), [30B](http://gpt2.h2o.ai), [HF Spaces1 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot) or [HF Spaces2 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot2)<br>"
80
- description += """<p>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md)</p>"""
81
  if is_hf:
82
  description += '''<a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" style="white-space: nowrap" alt="Duplicate Space"></a>'''
83
 
@@ -95,6 +111,7 @@ def go_gradio(**kwargs):
95
  else:
96
  css_code = """footer {visibility: hidden}"""
97
  css_code += """
 
98
  body.dark{#warning {background-color: #555555};}
99
  #small_btn {
100
  margin: 0.6em 0em 0.55em 0;
@@ -131,7 +148,19 @@ body.dark{#warning {background-color: #555555};}
131
 
132
  Chatbot._postprocess_chat_messages = _postprocess_chat_messages
133
 
134
- theme = H2oTheme() if kwargs['h2ocolors'] else SoftTheme()
 
 
 
 
 
 
 
 
 
 
 
 
135
  demo = gr.Blocks(theme=theme, css=css_code, title="h2oGPT", analytics_enabled=False)
136
  callback = gr.CSVLogger()
137
 
@@ -173,7 +202,11 @@ body.dark{#warning {background-color: #555555};}
173
  lora_options_state = gr.State([lora_options])
174
  my_db_state = gr.State([None, None])
175
  chat_state = gr.State({})
176
- docs_state = gr.State(['All'])
 
 
 
 
177
  gr.Markdown(f"""
178
  {get_h2o_title(title) if kwargs['h2ocolors'] else get_simple_title(title)}
179
 
@@ -258,10 +291,10 @@ body.dark{#warning {background-color: #555555};}
258
  radio_chats = gr.Radio(value=None, label="Saved Chats", visible=True, interactive=True,
259
  type='value')
260
  with gr.Row():
261
- clear_chat_btn = gr.Button(value="Clear Chat", visible=True)
262
- export_chats_btn = gr.Button(value="Export Chats to Download")
263
- remove_chat_btn = gr.Button(value="Remove Selected Chat", visible=True)
264
- add_to_chats_btn = gr.Button("Import Chats from Upload")
265
  with gr.Row():
266
  chats_file = gr.File(interactive=False, label="Download Exported Chats")
267
  chatsup_output = gr.File(label="Upload Chat File(s)",
@@ -269,7 +302,7 @@ body.dark{#warning {background-color: #555555};}
269
  file_count='multiple',
270
  elem_id="warning", elem_classes="feedback")
271
  with gr.TabItem("Data Source"):
272
- langchain_readme = get_url('https://github.com/h2oai/h2ogpt/blob/main/README_LangChain.md',
273
  from_str=True)
274
  gr.HTML(value=f"""LangChain Support Disabled<p>
275
  Run:<p>
@@ -302,7 +335,7 @@ body.dark{#warning {background-color: #555555};}
302
  with data_row2:
303
  with gr.Column(scale=50):
304
  document_choice = gr.Dropdown(docs_state.value,
305
- label="Choose Subset of Doc(s) in Collection [click get to update]",
306
  value=docs_state.value[0],
307
  interactive=True,
308
  multiselect=True,
@@ -312,6 +345,8 @@ body.dark{#warning {background-color: #555555};}
312
  ).style(full_width=False, size='sm')
313
  show_sources_btn = gr.Button(value="Show Sources",
314
  ).style(full_width=False, size='sm')
 
 
315
 
316
  # import control
317
  if kwargs['langchain_mode'] != 'Disabled':
@@ -375,7 +410,7 @@ body.dark{#warning {background-color: #555555};}
375
  with sources_row3:
376
  with gr.Column(scale=1):
377
  file_source = gr.File(interactive=False,
378
- label="Download File with Sources [click get to make file]")
379
  with gr.Column(scale=2):
380
  pass
381
  sources_row = gr.Row(visible=kwargs['langchain_mode'] != 'Disabled' and enable_sources_list).style(
@@ -411,14 +446,24 @@ body.dark{#warning {background-color: #555555};}
411
  )
412
  # FIXME: https://github.com/h2oai/h2ogpt/issues/106
413
  if os.getenv('TESTINGFAIL'):
414
- max_beams = 8 if not (is_low_mem or is_public) else 1
415
  else:
416
  max_beams = 1
417
  num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
418
  value=min(max_beams, kwargs['num_beams']), label="Beams",
419
  info="Number of searches for optimal overall probability. "
420
  "Uses more GPU memory/compute")
421
- max_max_new_tokens = 2048 if not is_low_mem else kwargs['max_new_tokens']
 
 
 
 
 
 
 
 
 
 
422
  max_new_tokens = gr.Slider(
423
  minimum=1, maximum=max_max_new_tokens, step=1,
424
  value=min(max_max_new_tokens, kwargs['max_new_tokens']), label="Max output length",
@@ -450,11 +495,19 @@ body.dark{#warning {background-color: #555555};}
450
  visible=not is_public)
451
  chat = gr.components.Checkbox(label="Chat mode", value=kwargs['chat'],
452
  visible=not is_public)
 
 
 
 
 
 
 
 
453
 
454
  with gr.TabItem("Models"):
455
- load_msg = "Load-Unload Model/LORA" if not is_public \
456
  else "LOAD-UNLOAD DISABLED FOR HOSTED DEMO"
457
- load_msg2 = "Load-Unload Model/LORA 2" if not is_public \
458
  else "LOAD-UNLOAD DISABLED FOR HOSTED DEMO 2"
459
  compare_checkbox = gr.components.Checkbox(label="Compare Mode",
460
  value=False, visible=not is_public)
@@ -468,7 +521,7 @@ body.dark{#warning {background-color: #555555};}
468
  lora_choice = gr.Dropdown(lora_options_state.value[0], label="Choose LORA",
469
  value=kwargs['lora_weights'], visible=kwargs['show_lora'])
470
  with gr.Column(scale=1):
471
- load_model_button = gr.Button(load_msg)
472
  model_load8bit_checkbox = gr.components.Checkbox(
473
  label="Load 8-bit [requires support]",
474
  value=kwargs['load_8bit'])
@@ -476,19 +529,12 @@ body.dark{#warning {background-color: #555555};}
476
  label="Choose Devices [If not Checked, use all GPUs]",
477
  value=kwargs['infer_devices'])
478
  model_gpu = gr.Dropdown(n_gpus_list,
479
- label="GPU ID 2 [-1 = all GPUs, if Choose is enabled]",
480
  value=kwargs['gpu_id'])
481
  model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'],
482
  interactive=False)
483
  lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'],
484
  visible=kwargs['show_lora'], interactive=False)
485
- with gr.Row():
486
- with gr.Column(scale=50):
487
- new_model = gr.Textbox(label="New Model HF name/path")
488
- new_lora = gr.Textbox(label="New LORA HF name/path", visible=kwargs['show_lora'])
489
- with gr.Column(scale=1):
490
- add_model_button = gr.Button("Add new model name")
491
- add_lora_button = gr.Button("Add new LORA name", visible=kwargs['show_lora'])
492
  col_model2 = gr.Column(visible=False)
493
  with col_model2:
494
  with gr.Row():
@@ -499,7 +545,7 @@ body.dark{#warning {background-color: #555555};}
499
  value=no_lora_str,
500
  visible=kwargs['show_lora'])
501
  with gr.Column(scale=1):
502
- load_model_button2 = gr.Button(load_msg2)
503
  model_load8bit_checkbox2 = gr.components.Checkbox(
504
  label="Load 8-bit 2 [requires support]",
505
  value=kwargs['load_8bit'])
@@ -508,12 +554,22 @@ body.dark{#warning {background-color: #555555};}
508
  value=kwargs[
509
  'infer_devices'])
510
  model_gpu2 = gr.Dropdown(n_gpus_list,
511
- label="GPU ID [-1 = all GPUs, if choose is enabled]",
512
  value=kwargs['gpu_id'])
513
  # no model/lora loaded ever in model2 by default
514
  model_used2 = gr.Textbox(label="Current Model 2", value=no_model_str)
515
  lora_used2 = gr.Textbox(label="Current LORA 2", value=no_lora_str,
516
  visible=kwargs['show_lora'])
 
 
 
 
 
 
 
 
 
 
517
  with gr.TabItem("System"):
518
  admin_row = gr.Row()
519
  with admin_row:
@@ -530,7 +586,7 @@ body.dark{#warning {background-color: #555555};}
530
  with gr.Row():
531
  zip_btn = gr.Button("Zip")
532
  zip_text = gr.Textbox(label="Zip file name", interactive=False)
533
- file_output = gr.File(interactive=False)
534
  with gr.Row():
535
  s3up_btn = gr.Button("S3UP")
536
  s3up_text = gr.Textbox(label='S3UP result', interactive=False)
@@ -542,7 +598,7 @@ body.dark{#warning {background-color: #555555};}
542
  description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
543
  if 'h2ogpt-research' in kwargs['base_model']:
544
  description += """<i><li>Research demonstration only, not used for commercial purposes.</i></li>"""
545
- description += """<i><li>By using h2oGPT, you accept our <a href="https://github.com/h2oai/h2ogpt/blob/main/tos.md">Terms of Service</a></i></li></ul></p>"""
546
  gr.Markdown(value=description, show_label=False, interactive=False)
547
 
548
  # Get flagged data
@@ -633,24 +689,37 @@ body.dark{#warning {background-color: #555555};}
633
  api_name='add_txt_to_my' if allow_api else None) \
634
  .then(clear_textbox, outputs=user_text_text, queue=queue)
635
 
636
- get_sources1 = functools.partial(get_sources, dbs=dbs)
637
 
638
  # if change collection source, must clear doc selections from it to avoid inconsistency
639
  def clear_doc_choice():
640
- return gr.Dropdown.update(choices=['All'], value=['All'])
641
 
642
  langchain_mode.change(clear_doc_choice, inputs=None, outputs=document_choice)
643
 
644
  def update_dropdown(x):
645
- return gr.Dropdown.update(choices=x, value='All')
646
 
647
- show_sources1 = functools.partial(get_source_files_given_langchain_mode, dbs=dbs)
648
  get_sources_btn.click(get_sources1, inputs=[my_db_state, langchain_mode], outputs=[file_source, docs_state],
649
  queue=queue,
650
  api_name='get_sources' if allow_api else None) \
651
  .then(fn=update_dropdown, inputs=docs_state, outputs=document_choice)
652
  # show button, else only show when add. Could add to above get_sources for download/dropdown, but bit much maybe
653
- show_sources_btn.click(fn=show_sources1, inputs=[my_db_state, langchain_mode], outputs=sources_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
 
655
  def check_admin_pass(x):
656
  return gr.update(visible=x == admin_pass)
@@ -661,10 +730,6 @@ body.dark{#warning {background-color: #555555};}
661
  admin_btn.click(check_admin_pass, inputs=admin_pass_textbox, outputs=system_row, queue=False) \
662
  .then(close_admin, inputs=admin_pass_textbox, outputs=admin_row, queue=False)
663
 
664
- # Get inputs to evaluate()
665
- # don't deepcopy, can contain model itself
666
- all_kwargs = kwargs.copy()
667
- all_kwargs.update(locals())
668
  inputs_list = get_inputs_list(all_kwargs, kwargs['model_lower'])
669
  from functools import partial
670
  kwargs_evaluate = {k: v for k, v in all_kwargs.items() if k in inputs_kwargs_list}
@@ -714,7 +779,10 @@ body.dark{#warning {background-color: #555555};}
714
  """ Similar to user() """
715
  args_list = list(args)
716
 
717
- max_length_tokenize = 512 if is_low_mem else 2048
 
 
 
718
  cutoff_len = max_length_tokenize * 4 # restrict deberta related to max for LLM
719
  smodel = score_model_state0[0]
720
  stokenizer = score_model_state0[1]
@@ -811,6 +879,8 @@ body.dark{#warning {background-color: #555555};}
811
  # e.g. when user just hits enter in textbox,
812
  # else will have <human>: <bot>: on single line, which seems to be "ok" for LLM but not usual
813
  user_message1 = '\n'
 
 
814
 
815
  history = args_list[-1]
816
  if undo and history:
@@ -830,6 +900,43 @@ body.dark{#warning {background-color: #555555};}
830
  # FIXME: compare, same history for now
831
  return history + [[user_message1, None]]
832
 
 
833
  def bot(*args, retry=False):
834
  """
835
  bot that consumes history for user input
@@ -861,47 +968,15 @@ body.dark{#warning {background-color: #555555};}
861
  history = []
862
  yield history, ''
863
  return
864
- # ensure output will be unique to models
865
- _, _, _, max_prompt_length = get_cutoffs(is_low_mem, for_context=True)
866
- history = copy.deepcopy(history)
867
  instruction1 = history[-1][0]
868
  if not instruction1:
869
  # reject empty query, can sometimes go nuts
870
  history = []
871
  yield history, ''
872
  return
873
-
874
- context1 = ''
875
- if max_prompt_length is not None and langchain_mode1 not in ['LLM']:
876
- prompt_type1 = args_list[eval_func_param_names.index('prompt_type')]
877
- chat1 = args_list[eval_func_param_names.index('chat')]
878
- context1 = ''
879
- # - 1 below because current instruction already in history from user()
880
- for histi in range(0, len(history) - 1):
881
- data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
882
- prompt, pre_response, terminate_response, chat_sep = generate_prompt(data_point, prompt_type1,
883
- chat1, reduced=True)
884
- # md -> back to text, maybe not super important if model trained enough
885
- if not kwargs['keep_sources_in_context']:
886
- from gpt_langchain import source_prefix, source_postfix
887
- import re
888
- prompt = re.sub(f'{re.escape(source_prefix)}.*?{re.escape(source_postfix)}', '', prompt,
889
- flags=re.DOTALL)
890
- if prompt.endswith('\n<p>'):
891
- prompt = prompt[:-4]
892
- prompt = prompt.replace('<br>', chat_sep)
893
- if not prompt.endswith(chat_sep):
894
- prompt += chat_sep
895
- # most recent first, add older if can
896
- # only include desired chat history
897
- if len(prompt + context1) > max_prompt_length:
898
- break
899
- context1 = prompt + context1
900
-
901
- _, pre_response, terminate_response, chat_sep = generate_prompt({}, prompt_type1, chat1,
902
- reduced=True)
903
- if context1 and not context1.endswith(chat_sep):
904
- context1 += chat_sep # ensure if terminates abruptly, then human continues on next line
905
  args_list[0] = instruction1 # override original instruction with history from user
906
  args_list[2] = context1
907
  fun1 = partial(evaluate,
@@ -909,8 +984,11 @@ body.dark{#warning {background-color: #555555};}
909
  my_db_state1,
910
  **kwargs_evaluate)
911
  try:
912
- for output in fun1(*tuple(args_list)):
913
- bot_message = output
 
 
 
914
  history[-1][1] = bot_message
915
  yield history, ''
916
  except StopIteration:
@@ -1067,11 +1145,11 @@ body.dark{#warning {background-color: #555555};}
1067
  if len(stepy) != 2:
1068
  # something off
1069
  return False
1070
- questionx = stepx[0].replace('<p>', '').replace('</p>', '')
1071
- answerx = stepx[1].replace('<p>', '').replace('</p>', '')
1072
 
1073
- questiony = stepy[0].replace('<p>', '').replace('</p>', '')
1074
- answery = stepy[1].replace('<p>', '').replace('</p>', '')
1075
 
1076
  if questionx != questiony or answerx != answery:
1077
  return False
@@ -1221,7 +1299,9 @@ body.dark{#warning {background-color: #555555};}
1221
  lora_weights = ''
1222
 
1223
  all_kwargs1['lora_weights'] = lora_weights.strip()
1224
- model1, tokenizer1, device1 = get_model(**all_kwargs1)
 
 
1225
  clear_torch_cache()
1226
 
1227
  if kwargs['debug']:
@@ -1242,7 +1322,7 @@ body.dark{#warning {background-color: #555555};}
1242
  chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
1243
  nochat_update_args = dict(fn=chatbot_list, inputs=[text_output_nochat, model_used], outputs=text_output_nochat)
1244
  if not is_public:
1245
- load_model_event = load_model_button.click(**load_model_args) \
1246
  .then(**prompt_update_args) \
1247
  .then(**chatbot_update_args) \
1248
  .then(**nochat_update_args) \
@@ -1255,7 +1335,8 @@ body.dark{#warning {background-color: #555555};}
1255
  prompt_update_args2 = dict(fn=dropdown_prompt_type_list, inputs=prompt_type2, outputs=prompt_type2)
1256
  chatbot_update_args2 = dict(fn=chatbot_list, inputs=[text_output2, model_used2], outputs=text_output2)
1257
  if not is_public:
1258
- load_model_event2 = load_model_button2.click(**load_model_args2) \
 
1259
  .then(**prompt_update_args2) \
1260
  .then(**chatbot_update_args2) \
1261
  .then(clear_torch_cache)
@@ -1331,6 +1412,27 @@ body.dark{#warning {background-color: #555555};}
1331
  submit_event3d, submit_event3f,
1332
  submit_event_nochat],
1333
  queue=False, api_name='stop' if allow_api else None).then(clear_torch_cache, queue=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1334
  demo.load(None, None, None, _js=get_dark_js() if kwargs['h2ocolors'] else None)
1335
 
1336
  demo.queue(concurrency_count=kwargs['concurrency_count'], api_open=kwargs['api_open'])
@@ -1339,7 +1441,7 @@ body.dark{#warning {background-color: #555555};}
1339
  scheduler = BackgroundScheduler()
1340
  scheduler.add_job(func=clear_torch_cache, trigger="interval", seconds=20)
1341
  if is_public and \
1342
- kwargs['base_model'] not in ['gptj', 'llama']:
1343
  # FIXME: disable for gptj, langchain or gpt4all modify print itself
1344
  # FIXME: and any multi-threaded/async print will enter model output!
1345
  scheduler.add_job(func=ping, trigger="interval", seconds=60)
@@ -1348,14 +1450,15 @@ body.dark{#warning {background-color: #555555};}
1348
  # import control
1349
  if kwargs['langchain_mode'] == 'Disabled' and \
1350
  os.environ.get("TEST_LANGCHAIN_IMPORT") and \
1351
- kwargs['base_model'] not in ['gptj', 'llama']:
1352
  assert 'gpt_langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
1353
  assert 'langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
1354
 
1355
  demo.launch(share=kwargs['share'], server_name="0.0.0.0", show_error=True,
1356
  favicon_path=favicon_path, prevent_thread_lock=True,
1357
  auth=kwargs['auth'])
1358
- print("Started GUI", flush=True)
 
1359
  if kwargs['block_gradio_exit']:
1360
  demo.block_thread()
1361
 
@@ -1384,7 +1487,7 @@ def get_inputs_list(inputs_dict, model_lower):
1384
  return inputs_list
1385
 
1386
 
1387
- def get_sources(db1, langchain_mode, dbs=None):
1388
  if langchain_mode in ['ChatLLM', 'LLM']:
1389
  source_files_added = "NA"
1390
  source_list = []
@@ -1407,7 +1510,7 @@ def get_sources(db1, langchain_mode, dbs=None):
1407
  sources_file = 'sources_%s_%s' % (langchain_mode, str(uuid.uuid4()))
1408
  with open(sources_file, "wt") as f:
1409
  f.write(source_files_added)
1410
- source_list = ['All'] + source_list
1411
  return sources_file, source_list
1412
 
1413
 
@@ -1471,7 +1574,7 @@ def _update_user_db(file, db1, x, y, dbs=None, db_type=None, langchain_mode='Use
1471
  if langchain_mode == 'MyData':
1472
  if db1[0] is not None:
1473
  # then add
1474
- add_to_db(db1[0], sources, db_type=db_type)
1475
  else:
1476
  assert len(db1) == 2 and db1[1] is None, "Bad MyData db: %s" % db1
1477
  # then create
@@ -1486,13 +1589,13 @@ def _update_user_db(file, db1, x, y, dbs=None, db_type=None, langchain_mode='Use
1486
  hf_embedding_model=hf_embedding_model)
1487
  if db1[0] is None:
1488
  db1[1] = None
1489
- source_files_added = get_source_files(db1[0], exceptions=exceptions)
1490
  return db1, x, y, source_files_added
1491
  else:
1492
  persist_directory = 'db_dir_%s' % langchain_mode
1493
  if langchain_mode in dbs and dbs[langchain_mode] is not None:
1494
  # then add
1495
- add_to_db(dbs[langchain_mode], sources, db_type=db_type)
1496
  else:
1497
  # then create
1498
  db = get_db(sources, use_openai_embedding=use_openai_embedding,
@@ -1504,11 +1607,11 @@ def _update_user_db(file, db1, x, y, dbs=None, db_type=None, langchain_mode='Use
1504
  # NOTE we do not return db, because function call always same code path
1505
  # return dbs[langchain_mode], x, y
1506
  # db in this code path is updated in place
1507
- source_files_added = get_source_files(dbs[langchain_mode], exceptions=exceptions)
1508
  return x, y, source_files_added
1509
 
1510
 
1511
- def get_source_files_given_langchain_mode(db1, langchain_mode='UserData', dbs=None):
1512
  with filelock.FileLock("db_%s.lock" % langchain_mode.replace(' ', '_')):
1513
  if langchain_mode in ['wiki_full']:
1514
  # NOTE: avoid showing full wiki. Takes about 30 seconds over about 90k entries, but not useful for now
@@ -1519,17 +1622,31 @@ def get_source_files_given_langchain_mode(db1, langchain_mode='UserData', dbs=No
1519
  db = dbs[langchain_mode]
1520
  else:
1521
  db = None
1522
- return get_source_files(db, exceptions=None)
 
 
 
 
 
1523
 
1524
 
1525
- def get_source_files(db, exceptions=None):
1526
  if exceptions is None:
1527
  exceptions = []
1528
 
1529
- if db is not None:
1530
- metadatas = db.get()['metadatas']
 
 
 
 
 
 
 
 
1531
  else:
1532
- metadatas = []
 
1533
 
1534
  # below automatically de-dups
1535
  from gpt_langchain import get_url
@@ -1558,28 +1675,28 @@ def get_source_files(db, exceptions=None):
1558
  <html>
1559
  <body>
1560
  <p>
1561
- Sources: <br>
1562
  </p>
1563
  <div style="overflow-y: auto;height:400px">
1564
- {0}
1565
  {1}
 
1566
  </div>
1567
  </body>
1568
  </html>
1569
- """.format(source_files_added, exceptions_html)
1570
  elif metadatas:
1571
  source_files_added = """\
1572
  <html>
1573
  <body>
1574
  <p>
1575
- Sources: <br>
1576
  </p>
1577
  <div style="overflow-y: auto;height:400px">
1578
- {0}
1579
  </div>
1580
  </body>
1581
  </html>
1582
- """.format(source_files_added)
1583
  elif exceptions_html:
1584
  source_files_added = """\
1585
  <html>
@@ -1594,6 +1711,31 @@ def get_source_files(db, exceptions=None):
1594
  </html>
1595
  """.format(exceptions_html)
1596
  else:
1597
- source_files_added = ""
 
 
 
1598
 
1599
  return source_files_added
 
 
9
  import uuid
10
  import filelock
11
  import pandas as pd
12
+ import requests
13
  import tabulate
14
 
15
+ # This is a hack to prevent Gradio from phoning home when it gets imported
16
+ os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
17
+
18
+
19
+ def my_get(url, **kwargs):
20
+ print('Gradio HTTP request redirected to localhost :)', flush=True)
21
+ kwargs.setdefault('allow_redirects', True)
22
+ return requests.api.request('get', 'http://127.0.0.1/', **kwargs)
23
+
24
+
25
+ original_get = requests.get
26
+ requests.get = my_get
27
+ import gradio as gr
28
+
29
+ requests.get = original_get
30
+
31
  from gradio_themes import H2oTheme, SoftTheme, get_h2o_title, get_simple_title, get_dark_js
32
  from prompter import Prompter, \
33
+ prompt_type_to_model_name, prompt_types_strings, inv_prompt_type_to_model_lower, generate_prompt, non_hf_types
34
  from utils import get_githash, flatten_list, zip_data, s3up, clear_torch_cache, get_torch_allocated, system_info_print, \
35
+ ping, get_short_name, get_url, makedirs, get_kwargs
36
  from generate import get_model, languages_covered, evaluate, eval_func_param_names, score_qa, langchain_modes, \
37
  inputs_kwargs_list, get_cutoffs, scratch_base_dir
38
 
 
39
  from apscheduler.schedulers.background import BackgroundScheduler
40
 
41
 
 
43
  allow_api = kwargs['allow_api']
44
  is_public = kwargs['is_public']
45
  is_hf = kwargs['is_hf']
46
+ memory_restriction_level = kwargs['memory_restriction_level']
47
  n_gpus = kwargs['n_gpus']
48
  admin_pass = kwargs['admin_pass']
49
  model_state0 = kwargs['model_state0']
50
  score_model_state0 = kwargs['score_model_state0']
 
51
  dbs = kwargs['dbs']
52
  db_type = kwargs['db_type']
53
  visible_langchain_modes = kwargs['visible_langchain_modes']
 
56
  enable_sources_list = kwargs['enable_sources_list']
57
  enable_url_upload = kwargs['enable_url_upload']
58
  enable_text_upload = kwargs['enable_text_upload']
 
59
  use_openai_embedding = kwargs['use_openai_embedding']
60
  hf_embedding_model = kwargs['hf_embedding_model']
61
  enable_captions = kwargs['enable_captions']
 
64
  caption_loader = kwargs['caption_loader']
65
 
66
  # easy update of kwargs needed for evaluate() etc.
67
+ queue = True
68
+ allow_upload = allow_upload_to_user_data or allow_upload_to_my_data
69
  kwargs.update(locals())
70
 
71
  if 'mbart-' in kwargs['model_lower']:
 
92
  """
93
  else:
94
  description = more_info
95
+ description += "If this host is busy, try [12B](https://gpt.h2o.ai), [Falcon 40B](http://falcon.h2o.ai), [HF Spaces1 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot) or [HF Spaces2 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot2)<br>"
96
+ description += """<p>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/docs/tos.md)</p>"""
97
  if is_hf:
98
  description += '''<a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" style="white-space: nowrap" alt="Duplicate Space"></a>'''
99
 
 
111
  else:
112
  css_code = """footer {visibility: hidden}"""
113
  css_code += """
114
+ @import url('https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600&display=swap');
115
  body.dark{#warning {background-color: #555555};}
116
  #small_btn {
117
  margin: 0.6em 0em 0.55em 0;
 
148
 
149
  Chatbot._postprocess_chat_messages = _postprocess_chat_messages
150
 
151
+ if kwargs['gradio_offline_level'] >= 0:
152
+ # avoid GoogleFont that pulls from internet
153
+ if kwargs['gradio_offline_level'] == 1:
154
+ # front end would still have to download fonts or have cached it at some point
155
+ base_font = 'Source Sans Pro'
156
+ else:
157
+ base_font = 'Helvetica'
158
+ theme_kwargs = dict(font=(base_font, 'ui-sans-serif', 'system-ui', 'sans-serif'),
159
+ font_mono=('IBM Plex Mono', 'ui-monospace', 'Consolas', 'monospace'))
160
+ else:
161
+ theme_kwargs = dict()
162
+
163
+ theme = H2oTheme(**theme_kwargs) if kwargs['h2ocolors'] else SoftTheme(**theme_kwargs)
164
  demo = gr.Blocks(theme=theme, css=css_code, title="h2oGPT", analytics_enabled=False)
165
  callback = gr.CSVLogger()
166
 
 
202
  lora_options_state = gr.State([lora_options])
203
  my_db_state = gr.State([None, None])
204
  chat_state = gr.State({})
205
+ # put the user's document choices first (so the first entry is the default choice), then de-dup
206
+ docs_state00 = kwargs['document_choice'] + ['All', 'Only', 'None']
207
+ docs_state0 = []
208
+ [docs_state0.append(x) for x in docs_state00 if x not in docs_state0]
209
+ docs_state = gr.State(docs_state0) # first is chosen as default
210
  gr.Markdown(f"""
211
  {get_h2o_title(title) if kwargs['h2ocolors'] else get_simple_title(title)}
212
 
 
291
  radio_chats = gr.Radio(value=None, label="Saved Chats", visible=True, interactive=True,
292
  type='value')
293
  with gr.Row():
294
+ clear_chat_btn = gr.Button(value="Clear Chat", visible=True).style(size='sm')
295
+ export_chats_btn = gr.Button(value="Export Chats to Download").style(size='sm')
296
+ remove_chat_btn = gr.Button(value="Remove Selected Chat", visible=True).style(size='sm')
297
+ add_to_chats_btn = gr.Button("Import Chats from Upload").style(size='sm')
298
  with gr.Row():
299
  chats_file = gr.File(interactive=False, label="Download Exported Chats")
300
  chatsup_output = gr.File(label="Upload Chat File(s)",
 
302
  file_count='multiple',
303
  elem_id="warning", elem_classes="feedback")
304
  with gr.TabItem("Data Source"):
305
+ langchain_readme = get_url('https://github.com/h2oai/h2ogpt/blob/main/docs/README_LangChain.md',
306
  from_str=True)
307
  gr.HTML(value=f"""LangChain Support Disabled<p>
308
  Run:<p>
 
335
  with data_row2:
336
  with gr.Column(scale=50):
337
  document_choice = gr.Dropdown(docs_state.value,
338
+ label="Choose Subset of Doc(s) in Collection [click get sources to update]",
339
  value=docs_state.value[0],
340
  interactive=True,
341
  multiselect=True,
 
345
  ).style(full_width=False, size='sm')
346
  show_sources_btn = gr.Button(value="Show Sources",
347
  ).style(full_width=False, size='sm')
348
+ refresh_sources_btn = gr.Button(value="Refresh Sources",
349
+ ).style(full_width=False, size='sm')
350
 
351
  # import control
352
  if kwargs['langchain_mode'] != 'Disabled':
 
410
  with sources_row3:
411
  with gr.Column(scale=1):
412
  file_source = gr.File(interactive=False,
413
+ label="Download File w/Sources [click get sources to make file]")
414
  with gr.Column(scale=2):
415
  pass
416
  sources_row = gr.Row(visible=kwargs['langchain_mode'] != 'Disabled' and enable_sources_list).style(
 
446
  )
447
  # FIXME: https://github.com/h2oai/h2ogpt/issues/106
448
  if os.getenv('TESTINGFAIL'):
449
+ max_beams = 8 if not (memory_restriction_level or is_public) else 1
450
  else:
451
  max_beams = 1
452
  num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
453
  value=min(max_beams, kwargs['num_beams']), label="Beams",
454
  info="Number of searches for optimal overall probability. "
455
  "Uses more GPU memory/compute")
456
+ # FIXME: 2048 should be tokenizer.model_max_length, but may not even have model yet
457
+ if kwargs['max_new_tokens']:
458
+ max_max_new_tokens = kwargs['max_new_tokens']
459
+ elif memory_restriction_level == 1:
460
+ max_max_new_tokens = 768
461
+ elif memory_restriction_level == 2:
462
+ max_max_new_tokens = 512
463
+ elif memory_restriction_level >= 3:
464
+ max_max_new_tokens = 256
465
+ else:
466
+ max_max_new_tokens = 2048
467
  max_new_tokens = gr.Slider(
468
  minimum=1, maximum=max_max_new_tokens, step=1,
469
  value=min(max_max_new_tokens, kwargs['max_new_tokens']), label="Max output length",
 
495
  visible=not is_public)
496
  chat = gr.components.Checkbox(label="Chat mode", value=kwargs['chat'],
497
  visible=not is_public)
498
+ count_chat_tokens_btn = gr.Button(value="Count Chat Tokens", visible=not is_public)
499
+ chat_token_count = gr.Textbox(label="Chat Token Count", value=None,
500
+ visible=not is_public, interactive=False)
501
+ top_k_docs = gr.Slider(minimum=0, maximum=20, step=1,
502
+ value=kwargs['top_k_docs'],
503
+ label="Number of document chunks",
504
+ info="For LangChain",
505
+ visible=not is_public)
506
 
507
  with gr.TabItem("Models"):
508
+ load_msg = "Load-Unload Model/LORA [unload works if did not use --base_model]" if not is_public \
509
  else "LOAD-UNLOAD DISABLED FOR HOSTED DEMO"
510
+ load_msg2 = "Load-Unload Model/LORA 2 [unload works if did not use --base_model]" if not is_public \
511
  else "LOAD-UNLOAD DISABLED FOR HOSTED DEMO 2"
512
  compare_checkbox = gr.components.Checkbox(label="Compare Mode",
513
  value=False, visible=not is_public)
 
521
  lora_choice = gr.Dropdown(lora_options_state.value[0], label="Choose LORA",
522
  value=kwargs['lora_weights'], visible=kwargs['show_lora'])
523
  with gr.Column(scale=1):
524
+ load_model_button = gr.Button(load_msg).style(full_width=False, size='sm')
525
  model_load8bit_checkbox = gr.components.Checkbox(
526
  label="Load 8-bit [requires support]",
527
  value=kwargs['load_8bit'])
 
529
  label="Choose Devices [If not Checked, use all GPUs]",
530
  value=kwargs['infer_devices'])
531
  model_gpu = gr.Dropdown(n_gpus_list,
532
+ label="GPU ID [-1 = all GPUs, if Choose is enabled]",
533
  value=kwargs['gpu_id'])
534
  model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'],
535
  interactive=False)
536
  lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'],
537
  visible=kwargs['show_lora'], interactive=False)
 
 
 
 
 
 
 
538
  col_model2 = gr.Column(visible=False)
539
  with col_model2:
540
  with gr.Row():
 
545
  value=no_lora_str,
546
  visible=kwargs['show_lora'])
547
  with gr.Column(scale=1):
548
+ load_model_button2 = gr.Button(load_msg2).style(full_width=False, size='sm')
549
  model_load8bit_checkbox2 = gr.components.Checkbox(
550
  label="Load 8-bit 2 [requires support]",
551
  value=kwargs['load_8bit'])
 
554
  value=kwargs[
555
  'infer_devices'])
556
  model_gpu2 = gr.Dropdown(n_gpus_list,
557
+ label="GPU ID 2 [-1 = all GPUs, if choose is enabled]",
558
  value=kwargs['gpu_id'])
559
  # no model/lora loaded ever in model2 by default
560
  model_used2 = gr.Textbox(label="Current Model 2", value=no_model_str)
561
  lora_used2 = gr.Textbox(label="Current LORA 2", value=no_lora_str,
562
  visible=kwargs['show_lora'])
563
+ with gr.Row():
564
+ with gr.Column(scale=50):
565
+ new_model = gr.Textbox(label="New Model HF name/path")
566
+ with gr.Row():
567
+ add_model_button = gr.Button("Add new model name").style(full_width=False, size='sm')
568
+ with gr.Column(scale=50):
569
+ new_lora = gr.Textbox(label="New LORA HF name/path", visible=kwargs['show_lora'])
570
+ with gr.Row():
571
+ add_lora_button = gr.Button("Add new LORA name", visible=kwargs['show_lora']).style(
572
+ full_width=False, size='sm')
573
  with gr.TabItem("System"):
574
  admin_row = gr.Row()
575
  with admin_row:
 
586
  with gr.Row():
587
  zip_btn = gr.Button("Zip")
588
  zip_text = gr.Textbox(label="Zip file name", interactive=False)
589
+ file_output = gr.File(interactive=False, label="Zip file to Download")
590
  with gr.Row():
591
  s3up_btn = gr.Button("S3UP")
592
  s3up_text = gr.Textbox(label='S3UP result', interactive=False)
 
598
  description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
599
  if 'h2ogpt-research' in kwargs['base_model']:
600
  description += """<i><li>Research demonstration only, not used for commercial purposes.</i></li>"""
601
+ description += """<i><li>By using h2oGPT, you accept our <a href="https://github.com/h2oai/h2ogpt/blob/main/docs/tos.md">Terms of Service</a></i></li></ul></p>"""
602
  gr.Markdown(value=description, show_label=False, interactive=False)
603
 
604
  # Get flagged data
 
689
  api_name='add_txt_to_my' if allow_api else None) \
690
  .then(clear_textbox, outputs=user_text_text, queue=queue)
691
 
692
+ get_sources1 = functools.partial(get_sources, dbs=dbs, docs_state0=docs_state0)
693
 
694
  # if change collection source, must clear doc selections from it to avoid inconsistency
695
  def clear_doc_choice():
696
+ return gr.Dropdown.update(choices=docs_state0, value=[docs_state0[0]])
697
 
698
  langchain_mode.change(clear_doc_choice, inputs=None, outputs=document_choice)
699
 
700
  def update_dropdown(x):
701
+ return gr.Dropdown.update(choices=x, value=[docs_state0[0]])
702
 
 
703
  get_sources_btn.click(get_sources1, inputs=[my_db_state, langchain_mode], outputs=[file_source, docs_state],
704
  queue=queue,
705
  api_name='get_sources' if allow_api else None) \
706
  .then(fn=update_dropdown, inputs=docs_state, outputs=document_choice)
707
  # show via button; otherwise sources only show when added. Could fold into the get_sources handler above for download/dropdown, but that may be too much
708
+ show_sources1 = functools.partial(get_source_files_given_langchain_mode, dbs=dbs)
709
+ show_sources_btn.click(fn=show_sources1, inputs=[my_db_state, langchain_mode], outputs=sources_text,
710
+ api_name='show_sources' if allow_api else None)
711
+
712
+ # Get inputs to evaluate() and make_db()
713
+ # don't deepcopy, can contain model itself
714
+ all_kwargs = kwargs.copy()
715
+ all_kwargs.update(locals())
716
+
717
+ refresh_sources1 = functools.partial(update_and_get_source_files_given_langchain_mode,
718
+ **get_kwargs(update_and_get_source_files_given_langchain_mode,
719
+ exclude_names=['db1', 'langchain_mode'],
720
+ **all_kwargs))
721
+ refresh_sources_btn.click(fn=refresh_sources1, inputs=[my_db_state, langchain_mode], outputs=sources_text,
722
+ api_name='refresh_sources' if allow_api else None)
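Here get_kwargs presumably trims the large all_kwargs dict down to just the parameters the target function accepts, minus the excluded names; a rough inspect-based sketch of that idea (an assumption, not the repo's actual implementation):

    import inspect

    def get_kwargs_sketch(func, exclude_names=None, **all_kwargs):
        # keep only keys that appear in func's signature and are not excluded
        exclude_names = exclude_names or []
        params = inspect.signature(func).parameters
        return {k: v for k, v in all_kwargs.items() if k in params and k not in exclude_names}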
723
 
724
  def check_admin_pass(x):
725
  return gr.update(visible=x == admin_pass)
 
730
  admin_btn.click(check_admin_pass, inputs=admin_pass_textbox, outputs=system_row, queue=False) \
731
  .then(close_admin, inputs=admin_pass_textbox, outputs=admin_row, queue=False)
732
 
 
 
 
 
733
  inputs_list = get_inputs_list(all_kwargs, kwargs['model_lower'])
734
  from functools import partial
735
  kwargs_evaluate = {k: v for k, v in all_kwargs.items() if k in inputs_kwargs_list}
 
779
  """ Similar to user() """
780
  args_list = list(args)
781
 
782
+ if memory_restriction_level > 0:
783
+ max_length_tokenize = 768 - 256 if memory_restriction_level <= 2 else 512 - 256
784
+ else:
785
+ max_length_tokenize = 2048 - 256
786
  cutoff_len = max_length_tokenize * 4 # restrict deberta related to max for LLM
787
  smodel = score_model_state0[0]
788
  stokenizer = score_model_state0[1]
 
879
  # e.g. when user just hits enter in textbox,
880
  # else will have <human>: <bot>: on single line, which seems to be "ok" for LLM but not usual
881
  user_message1 = '\n'
882
+ # ensure good visually, else markdown ignores multiple \n
883
+ user_message1 = user_message1.replace('\n', '<br>')
884
 
885
  history = args_list[-1]
886
  if undo and history:
 
900
  # FIXME: compare, same history for now
901
  return history + [[user_message1, None]]
902
 
903
+ def history_to_context(history, langchain_mode1, prompt_type1, chat1):
904
+ # ensure output will be unique to models
905
+ # FIXME: hard-coded 2048 implicitly passed:
906
+ _, _, _, max_prompt_length = get_cutoffs(memory_restriction_level, for_context=True)
907
+ history = copy.deepcopy(history)
908
+
909
+ context1 = ''
910
+ if max_prompt_length is not None and langchain_mode1 not in ['LLM']:
911
+ context1 = ''
912
+ # - 1 below because current instruction already in history from user()
913
+ for histi in range(0, len(history) - 1):
914
+ data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
915
+ prompt, pre_response, terminate_response, chat_sep = generate_prompt(data_point, prompt_type1,
916
+ chat1, reduced=True)
917
+ # md -> back to text, maybe not super important if model trained enough
918
+ if not kwargs['keep_sources_in_context']:
919
+ from gpt_langchain import source_prefix, source_postfix
920
+ import re
921
+ prompt = re.sub(f'{re.escape(source_prefix)}.*?{re.escape(source_postfix)}', '', prompt,
922
+ flags=re.DOTALL)
923
+ if prompt.endswith('\n<p>'):
924
+ prompt = prompt[:-4]
925
+ prompt = prompt.replace('<br>', chat_sep)
926
+ if not prompt.endswith(chat_sep):
927
+ prompt += chat_sep
928
+ # most recent first, add older if can
929
+ # only include desired chat history
930
+ if len(prompt + context1) > max_prompt_length:
931
+ break
932
+ context1 = prompt + context1
933
+
934
+ _, pre_response, terminate_response, chat_sep = generate_prompt({}, prompt_type1, chat1,
935
+ reduced=True)
936
+ if context1 and not context1.endswith(chat_sep):
937
+ context1 += chat_sep # ensure if terminates abruptly, then human continues on next line
938
+ return context1
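A toy sketch of how history_to_context behaves (hypothetical turns; assumes the human_bot prompt type and chat mode):

    history = [["What is h2oGPT?", "An open-source LLM project."],
               ["Does it support LangChain?", None]]   # current turn, answer pending
    context1 = history_to_context(history, langchain_mode1='ChatLLM',
                                  prompt_type1='human_bot', chat1=True)
    # context1 accumulates the prior turns rendered with generate_prompt(),
    # truncated to max_prompt_length and terminated with chat_sep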
939
+
940
  def bot(*args, retry=False):
941
  """
942
  bot that consumes history for user input
 
968
  history = []
969
  yield history, ''
970
  return
 
 
 
971
  instruction1 = history[-1][0]
972
  if not instruction1:
973
  # reject empty query, can sometimes go nuts
974
  history = []
975
  yield history, ''
976
  return
977
+ prompt_type1 = args_list[eval_func_param_names.index('prompt_type')]
978
+ chat1 = args_list[eval_func_param_names.index('chat')]
979
+ context1 = history_to_context(history, langchain_mode1, prompt_type1, chat1)
 
 
980
  args_list[0] = instruction1 # override original instruction with history from user
981
  args_list[2] = context1
982
  fun1 = partial(evaluate,
 
984
  my_db_state1,
985
  **kwargs_evaluate)
986
  try:
987
+ for output_fun in fun1(*tuple(args_list)):
988
+ output = output_fun['response']
989
+ extra = output_fun['sources'] # FIXME: can show sources in separate text box etc.
990
+ # ensure good visually, else markdown ignores multiple \n
991
+ bot_message = output.replace('\n', '<br>')
992
  history[-1][1] = bot_message
993
  yield history, ''
994
  except StopIteration:
 
1145
  if len(stepy) != 2:
1146
  # something off
1147
  return False
1148
+ questionx = stepx[0].replace('<p>', '').replace('</p>', '') if stepx[0] is not None else None
1149
+ answerx = stepx[1].replace('<p>', '').replace('</p>', '') if stepx[1] is not None else None
1150
 
1151
+ questiony = stepy[0].replace('<p>', '').replace('</p>', '') if stepy[0] is not None else None
1152
+ answery = stepy[1].replace('<p>', '').replace('</p>', '') if stepy[1] is not None else None
1153
 
1154
  if questionx != questiony or answerx != answery:
1155
  return False
 
1299
  lora_weights = ''
1300
 
1301
  all_kwargs1['lora_weights'] = lora_weights.strip()
1302
+ model1, tokenizer1, device1 = get_model(reward_type=False,
1303
+ **get_kwargs(get_model, exclude_names=['reward_type'],
1304
+ **all_kwargs1))
1305
  clear_torch_cache()
1306
 
1307
  if kwargs['debug']:
 
1322
  chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
1323
  nochat_update_args = dict(fn=chatbot_list, inputs=[text_output_nochat, model_used], outputs=text_output_nochat)
1324
  if not is_public:
1325
+ load_model_event = load_model_button.click(**load_model_args, api_name='load_model' if allow_api else None) \
1326
  .then(**prompt_update_args) \
1327
  .then(**chatbot_update_args) \
1328
  .then(**nochat_update_args) \
 
1335
  prompt_update_args2 = dict(fn=dropdown_prompt_type_list, inputs=prompt_type2, outputs=prompt_type2)
1336
  chatbot_update_args2 = dict(fn=chatbot_list, inputs=[text_output2, model_used2], outputs=text_output2)
1337
  if not is_public:
1338
+ load_model_event2 = load_model_button2.click(**load_model_args2,
1339
+ api_name='load_model2' if allow_api else None) \
1340
  .then(**prompt_update_args2) \
1341
  .then(**chatbot_update_args2) \
1342
  .then(clear_torch_cache)
 
1412
  submit_event3d, submit_event3f,
1413
  submit_event_nochat],
1414
  queue=False, api_name='stop' if allow_api else None).then(clear_torch_cache, queue=False)
1415
+
1416
+ def count_chat_tokens(model_state1, chat1, prompt_type1):
1417
+ if model_state1 and not isinstance(model_state1[1], str):
1418
+ tokenizer = model_state1[1]
1419
+ elif model_state0 and not isinstance(model_state0[1], str):
1420
+ tokenizer = model_state0[1]
1421
+ else:
1422
+ tokenizer = None
1423
+ if tokenizer is not None:
1424
+ langchain_mode1 = 'ChatLLM'
1425
+ # fake user message to mimic bot()
1426
+ chat1 = copy.deepcopy(chat1)
1427
+ chat1 = chat1 + [['user_message1', None]]
1428
+ context1 = history_to_context(chat1, langchain_mode1, prompt_type1, chat1)
1429
+ return str(tokenizer(context1, return_tensors="pt")['input_ids'].shape[1])
1430
+ else:
1431
+ return "N/A"
1432
+
1433
+ count_chat_tokens_btn.click(fn=count_chat_tokens, inputs=[model_state, text_output, prompt_type],
1434
+ outputs=chat_token_count, api_name='count_tokens' if allow_api else None)
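The count is simply the tokenized length of the reconstructed context; the same idea standalone with a Hugging Face tokenizer (model name illustrative):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("h2oai/h2ogpt-oig-oasst1-512-6_9b")  # illustrative
    context1 = "<human>: What is Chroma?\n<bot>: A vector database.\n"
    n_tokens = tokenizer(context1, return_tensors="pt")["input_ids"].shape[1]
    print(n_tokens)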
1435
+
1436
  demo.load(None, None, None, _js=get_dark_js() if kwargs['h2ocolors'] else None)
1437
 
1438
  demo.queue(concurrency_count=kwargs['concurrency_count'], api_open=kwargs['api_open'])
 
1441
  scheduler = BackgroundScheduler()
1442
  scheduler.add_job(func=clear_torch_cache, trigger="interval", seconds=20)
1443
  if is_public and \
1444
+ kwargs['base_model'] not in non_hf_types:
1445
  # FIXME: disable for gptj, langchain or gpt4all modify print itself
1446
  # FIXME: and any multi-threaded/async print will enter model output!
1447
  scheduler.add_job(func=ping, trigger="interval", seconds=60)
 
1450
  # import control
1451
  if kwargs['langchain_mode'] == 'Disabled' and \
1452
  os.environ.get("TEST_LANGCHAIN_IMPORT") and \
1453
+ kwargs['base_model'] not in non_hf_types:
1454
  assert 'gpt_langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
1455
  assert 'langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
1456
 
1457
  demo.launch(share=kwargs['share'], server_name="0.0.0.0", show_error=True,
1458
  favicon_path=favicon_path, prevent_thread_lock=True,
1459
  auth=kwargs['auth'])
1460
+ if kwargs['verbose']:
1461
+ print("Started GUI", flush=True)
1462
  if kwargs['block_gradio_exit']:
1463
  demo.block_thread()
1464
 
 
1487
  return inputs_list
1488
 
1489
 
1490
+ def get_sources(db1, langchain_mode, dbs=None, docs_state0=None):
1491
  if langchain_mode in ['ChatLLM', 'LLM']:
1492
  source_files_added = "NA"
1493
  source_list = []
 
1510
  sources_file = 'sources_%s_%s' % (langchain_mode, str(uuid.uuid4()))
1511
  with open(sources_file, "wt") as f:
1512
  f.write(source_files_added)
1513
+ source_list = docs_state0 + source_list
1514
  return sources_file, source_list
1515
 
1516
 
 
1574
  if langchain_mode == 'MyData':
1575
  if db1[0] is not None:
1576
  # then add
1577
+ db, num_new_sources, new_sources_metadata = add_to_db(db1[0], sources, db_type=db_type)
1578
  else:
1579
  assert len(db1) == 2 and db1[1] is None, "Bad MyData db: %s" % db1
1580
  # then create
 
1589
  hf_embedding_model=hf_embedding_model)
1590
  if db1[0] is None:
1591
  db1[1] = None
1592
+ source_files_added = get_source_files(db=db1[0], exceptions=exceptions)
1593
  return db1, x, y, source_files_added
1594
  else:
1595
  persist_directory = 'db_dir_%s' % langchain_mode
1596
  if langchain_mode in dbs and dbs[langchain_mode] is not None:
1597
  # then add
1598
+ db, num_new_sources, new_sources_metadata = add_to_db(dbs[langchain_mode], sources, db_type=db_type)
1599
  else:
1600
  # then create
1601
  db = get_db(sources, use_openai_embedding=use_openai_embedding,
 
1607
  # NOTE we do not return db, because function call always same code path
1608
  # return dbs[langchain_mode], x, y
1609
  # db in this code path is updated in place
1610
+ source_files_added = get_source_files(db=dbs[langchain_mode], exceptions=exceptions)
1611
  return x, y, source_files_added
1612
 
1613
 
1614
+ def get_db(db1, langchain_mode, dbs=None):
1615
  with filelock.FileLock("db_%s.lock" % langchain_mode.replace(' ', '_')):
1616
  if langchain_mode in ['wiki_full']:
1617
  # NOTE: avoid showing full wiki. Takes about 30 seconds over about 90k entries, but not useful for now
 
1622
  db = dbs[langchain_mode]
1623
  else:
1624
  db = None
1625
+ return db
1626
+
1627
+
1628
+ def get_source_files_given_langchain_mode(db1, langchain_mode='UserData', dbs=None):
1629
+ db = get_db(db1, langchain_mode, dbs=dbs)
1630
+ return get_source_files(db=db, exceptions=None)
1631
 
1632
 
1633
+ def get_source_files(db=None, exceptions=None, metadatas=None):
1634
  if exceptions is None:
1635
  exceptions = []
1636
 
1637
+ # only should be one source, not confused
1638
+ assert db is not None or metadatas is not None
1639
+
1640
+ if metadatas is None:
1641
+ source_label = "Sources:"
1642
+ if db is not None:
1643
+ metadatas = db.get()['metadatas']
1644
+ else:
1645
+ metadatas = []
1646
+ adding_new = False
1647
  else:
1648
+ source_label = "New Sources:"
1649
+ adding_new = True
1650
 
1651
  # below automatically de-dups
1652
  from gpt_langchain import get_url
 
1675
  <html>
1676
  <body>
1677
  <p>
1678
+ {0} <br>
1679
  </p>
1680
  <div style="overflow-y: auto;height:400px">
 
1681
  {1}
1682
+ {2}
1683
  </div>
1684
  </body>
1685
  </html>
1686
+ """.format(source_label, source_files_added, exceptions_html)
1687
  elif metadatas:
1688
  source_files_added = """\
1689
  <html>
1690
  <body>
1691
  <p>
1692
+ {0} <br>
1693
  </p>
1694
  <div style="overflow-y: auto;height:400px">
1695
+ {1}
1696
  </div>
1697
  </body>
1698
  </html>
1699
+ """.format(source_label, source_files_added)
1700
  elif exceptions_html:
1701
  source_files_added = """\
1702
  <html>
 
1711
  </html>
1712
  """.format(exceptions_html)
1713
  else:
1714
+ if adding_new:
1715
+ source_files_added = "No New Sources"
1716
+ else:
1717
+ source_files_added = "No Sources"
1718
 
1719
  return source_files_added
1720
+
1721
+
1722
+ def update_and_get_source_files_given_langchain_mode(db1, langchain_mode, dbs=None, first_para=None,
1723
+ text_limit=None, chunk=None, chunk_size=None,
1724
+ user_path=None, db_type=None, load_db_if_exists=None,
1725
+ n_jobs=None, verbose=None):
1726
+ db = get_db(db1, langchain_mode, dbs=dbs)
1727
+
1728
+ from gpt_langchain import make_db
1729
+ db, num_new_sources, new_sources_metadata = make_db(use_openai_embedding=False,
1730
+ hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
1731
+ first_para=first_para, text_limit=text_limit, chunk=chunk,
1732
+ chunk_size=chunk_size,
1733
+ langchain_mode=langchain_mode,
1734
+ user_path=user_path,
1735
+ db_type=db_type,
1736
+ load_db_if_exists=load_db_if_exists,
1737
+ db=db,
1738
+ n_jobs=n_jobs,
1739
+ verbose=verbose)
1740
+ # return only new sources with text saying such
1741
+ return get_source_files(db=None, exceptions=None, metadatas=new_sources_metadata)
gradio_themes.py CHANGED
@@ -1,7 +1,10 @@
1
  from __future__ import annotations
 
 
 
2
  from gradio.themes.soft import Soft
3
  from gradio.themes import Color
4
- from gradio.themes.utils import colors, sizes
5
 
6
  h2o_yellow = Color(
7
  name="yellow",
@@ -43,6 +46,22 @@ class H2oTheme(Soft):
43
  spacing_size: sizes.Size | str = sizes.spacing_md,
44
  radius_size: sizes.Size | str = sizes.radius_md,
45
  text_size: sizes.Size | str = sizes.text_lg,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  ):
47
  super().__init__(
48
  primary_hue=primary_hue,
@@ -51,6 +70,8 @@ class H2oTheme(Soft):
51
  spacing_size=spacing_size,
52
  radius_size=radius_size,
53
  text_size=text_size,
 
 
54
  )
55
  super().set(
56
  link_text_color="#3344DD",
@@ -89,6 +110,22 @@ class SoftTheme(Soft):
89
  spacing_size: sizes.Size | str = sizes.spacing_md,
90
  radius_size: sizes.Size | str = sizes.radius_md,
91
  text_size: sizes.Size | str = sizes.text_md,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  ):
93
  super().__init__(
94
  primary_hue=primary_hue,
@@ -97,6 +134,8 @@ class SoftTheme(Soft):
97
  spacing_size=spacing_size,
98
  radius_size=radius_size,
99
  text_size=text_size,
 
 
100
  )
101
 
102
 
@@ -125,7 +164,7 @@ def get_h2o_title(title):
125
  <h1 style="line-height:60px">{title}</h1>
126
  </div>
127
  <div style="float:right; height: 80px; width: 80px; margin-top:-100px">
128
- <img src=https://raw.githubusercontent.com/h2oai/h2ogpt/main/h2o-qr.png></img>
129
  </div>
130
  """
131
 
 
1
  from __future__ import annotations
2
+
3
+ from typing import Iterable
4
+
5
  from gradio.themes.soft import Soft
6
  from gradio.themes import Color
7
+ from gradio.themes.utils import colors, sizes, fonts
8
 
9
  h2o_yellow = Color(
10
  name="yellow",
 
46
  spacing_size: sizes.Size | str = sizes.spacing_md,
47
  radius_size: sizes.Size | str = sizes.radius_md,
48
  text_size: sizes.Size | str = sizes.text_lg,
49
+ font: fonts.Font
50
+ | str
51
+ | Iterable[fonts.Font | str] = (
52
+ fonts.GoogleFont("Montserrat"),
53
+ "ui-sans-serif",
54
+ "system-ui",
55
+ "sans-serif",
56
+ ),
57
+ font_mono: fonts.Font
58
+ | str
59
+ | Iterable[fonts.Font | str] = (
60
+ fonts.GoogleFont("IBM Plex Mono"),
61
+ "ui-monospace",
62
+ "Consolas",
63
+ "monospace",
64
+ ),
65
  ):
66
  super().__init__(
67
  primary_hue=primary_hue,
 
70
  spacing_size=spacing_size,
71
  radius_size=radius_size,
72
  text_size=text_size,
73
+ font=font,
74
+ font_mono=font_mono,
75
  )
76
  super().set(
77
  link_text_color="#3344DD",
 
110
  spacing_size: sizes.Size | str = sizes.spacing_md,
111
  radius_size: sizes.Size | str = sizes.radius_md,
112
  text_size: sizes.Size | str = sizes.text_md,
113
+ font: fonts.Font
114
+ | str
115
+ | Iterable[fonts.Font | str] = (
116
+ fonts.GoogleFont("Montserrat"),
117
+ "ui-sans-serif",
118
+ "system-ui",
119
+ "sans-serif",
120
+ ),
121
+ font_mono: fonts.Font
122
+ | str
123
+ | Iterable[fonts.Font | str] = (
124
+ fonts.GoogleFont("IBM Plex Mono"),
125
+ "ui-monospace",
126
+ "Consolas",
127
+ "monospace",
128
+ ),
129
  ):
130
  super().__init__(
131
  primary_hue=primary_hue,
 
134
  spacing_size=spacing_size,
135
  radius_size=radius_size,
136
  text_size=text_size,
137
+ font=font,
138
+ font_mono=font_mono,
139
  )
140
 
141
 
 
164
  <h1 style="line-height:60px">{title}</h1>
165
  </div>
166
  <div style="float:right; height: 80px; width: 80px; margin-top:-100px">
167
+ <img src=https://raw.githubusercontent.com/h2oai/h2ogpt/main/docs/h2o-qr.png></img>
168
  </div>
169
  """
170
 
h2oai_pipeline.py CHANGED
@@ -2,36 +2,57 @@ from transformers import TextGenerationPipeline
  from transformers.pipelines.text_generation import ReturnType
 
  from stopping import get_stopping
-
- prompt_type = "human_bot"
- human = "<human>:"
- bot = "<bot>:"
-
- # human-bot interaction like OIG dataset
- prompt = """{human} {instruction}
- {bot}""".format(
- human=human,
- instruction="{instruction}",
- bot=bot,
- )
+ from prompter import Prompter
 
 
  class H2OTextGenerationPipeline(TextGenerationPipeline):
- def __init__(self, *args, use_prompter=False, debug=False, chat=False, stream_output=False,
- sanitize_bot_response=True, **kwargs):
+ def __init__(self, *args, debug=False, chat=False, stream_output=False,
+ sanitize_bot_response=True,
+ use_prompter=True, prompter=None, prompt_type=None,
+ max_input_tokens=2048 - 256, **kwargs):
+ """
+ HF-like pipeline, but handle instruction prompting and stopping (for some models)
+ :param args:
+ :param debug:
+ :param chat:
+ :param stream_output:
+ :param sanitize_bot_response:
+ :param use_prompter: Whether to use prompter. If pass prompt_type, will make prompter
+ :param prompter: prompter, can pass if have already
+ :param prompt_type: prompt_type, e.g. human_bot. See prompt_type to model mapping in from prompter.py.
+ If use_prompter, then will make prompter and use it.
+ :param max_input_tokens:
+ :param kwargs:
+ """
  super().__init__(*args, **kwargs)
- self.use_prompter = use_prompter
  self.prompt_text = None
+ self.use_prompter = use_prompter
+ self.prompt_type = prompt_type
+ self.prompter = prompter
  if self.use_prompter:
- from prompter import Prompter
- self.prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
+ if self.prompter is not None:
+ assert self.prompter.prompt_type is not None
+ else:
+ self.prompter = Prompter(self.prompt_type, debug=debug, chat=chat, stream_output=stream_output)
+ self.human = self.prompter.humanstr
+ self.bot = self.prompter.botstr
+ self.can_stop = True
  else:
  self.prompter = None
+ self.human = None
+ self.bot = None
+ self.can_stop = False
  self.sanitize_bot_response = sanitize_bot_response
+ self.max_input_tokens = max_input_tokens  # not for generate, so ok that not kwargs
 
  def preprocess(self, prompt_text, prefix="", handle_long_generation=None, **generate_kwargs):
+ data_point = dict(context='', instruction=prompt_text, input='')
+ if self.prompter is not None:
+ prompt_text = self.prompter.generate_prompt(data_point)
  self.prompt_text = prompt_text
+ if handle_long_generation is None:
+ # forces truncation of inputs to avoid critical failure
+ handle_long_generation = 'hole'
  return super().preprocess(prompt_text, prefix=prefix, handle_long_generation=handle_long_generation,
  **generate_kwargs)
 
@@ -43,12 +64,65 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
  outputs = rec['generated_text']
  outputs = self.prompter.get_response(outputs, prompt=self.prompt_text,
  sanitize_bot_response=self.sanitize_bot_response)
+ elif self.bot and self.human:
+ outputs = rec['generated_text'].split(self.bot)[1].strip().split(self.human)[0].strip()
  else:
- outputs = rec['generated_text'].split(bot)[1].strip().split(human)[0].strip()
+ outputs = rec['generated_text']
  rec['generated_text'] = outputs
  return records
 
  def _forward(self, model_inputs, **generate_kwargs):
- stopping_criteria = get_stopping(prompt_type, self.tokenizer, self.device, human=human, bot=bot)
- generate_kwargs['stopping_criteria'] = stopping_criteria
- return super()._forward(model_inputs, **generate_kwargs)
+ if self.can_stop:
+ stopping_criteria = get_stopping(self.prompt_type, self.tokenizer, self.device, human=self.human,
+ bot=self.bot)
+ generate_kwargs['stopping_criteria'] = stopping_criteria
+ # return super()._forward(model_inputs, **generate_kwargs)
+ return self.__forward(model_inputs, **generate_kwargs)
+
+ # FIXME: Copy-paste of original _forward, but removed copy.deepcopy()
+ # FIXME: https://github.com/h2oai/h2ogpt/issues/172
+ def __forward(self, model_inputs, **generate_kwargs):
+ input_ids = model_inputs["input_ids"]
+ attention_mask = model_inputs.get("attention_mask", None)
+ # Allow empty prompts
+ if input_ids.shape[1] == 0:
+ input_ids = None
+ attention_mask = None
+ in_b = 1
+ else:
+ in_b = input_ids.shape[0]
+ prompt_text = model_inputs.pop("prompt_text")
+
+ ## If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying
+ ## generate_kwargs, as some of the parameterization may come from the initialization of the pipeline.
+ # generate_kwargs = copy.deepcopy(generate_kwargs)
+ prefix_length = generate_kwargs.pop("prefix_length", 0)
+ if prefix_length > 0:
+ has_max_new_tokens = "max_new_tokens" in generate_kwargs or (
+ "generation_config" in generate_kwargs
+ and generate_kwargs["generation_config"].max_new_tokens is not None
+ )
+ if not has_max_new_tokens:
+ generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.model.config.max_length
+ generate_kwargs["max_length"] += prefix_length
+ has_min_new_tokens = "min_new_tokens" in generate_kwargs or (
+ "generation_config" in generate_kwargs
+ and generate_kwargs["generation_config"].min_new_tokens is not None
+ )
+ if not has_min_new_tokens and "min_length" in generate_kwargs:
+ generate_kwargs["min_length"] += prefix_length
+
+ # BS x SL
+ generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
+ out_b = generated_sequence.shape[0]
+ if self.framework == "pt":
+ generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
+ elif self.framework == "tf":
+ from transformers import is_tf_available
+ if is_tf_available():
+ import tensorflow as tf
+ generated_sequence = tf.reshape(generated_sequence,
+ (in_b, out_b // in_b, *generated_sequence.shape[1:]))
+ else:
+ raise ValueError("TF not avaialble.")
+ return {"generated_sequence": generated_sequence, "input_ids": input_ids, "prompt_text": prompt_text}
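A rough usage sketch of the reworked pipeline (the model name and generation settings are illustrative, not mandated by this commit). The point is that prompt_type now drives a Prompter, replacing the removed module-level human_bot template:

# Hypothetical example, assuming a model that expects the human_bot prompt.
from transformers import AutoModelForCausalLM, AutoTokenizer
from h2oai_pipeline import H2OTextGenerationPipeline

model_name = "h2oai/h2ogpt-oig-oasst1-512-6_9b"  # illustrative choice
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

pipe = H2OTextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    prompt_type="human_bot",      # builds a Prompter with the <human>:/<bot>: template
    max_input_tokens=2048 - 256,  # see the __init__ docstring above
)
print(pipe("Why is the sky blue?", max_new_tokens=128)[0]["generated_text"])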
prompter.py CHANGED
@@ -1,6 +1,8 @@
  import time
  from enum import Enum
 
+ non_hf_types = ['gpt4all_llama', 'llama', 'gptj']
+
 
  class PromptType(Enum):
  plain = 0
@@ -17,6 +19,10 @@ class PromptType(Enum):
  open_assistant = 11
  wizard_lm = 12
  wizard_mega = 13
+ instruct_vicuna2 = 14
+ instruct_vicuna3 = 15
+ wizard2 = 16
+ wizard3 = 17
 
 
  prompt_type_to_model_name = {
@@ -26,6 +32,7 @@ prompt_type_to_model_name = {
  'EleutherAI/pythia-12b',
  'EleutherAI/pythia-12b-deduped',
  'EleutherAI/gpt-neox-20b',
+ 'openlm-research/open_llama_7b_700bt_preview',
  'decapoda-research/llama-7b-hf',
  'decapoda-research/llama-13b-hf',
  'decapoda-research/llama-30b-hf',
@@ -39,7 +46,8 @@ prompt_type_to_model_name = {
  'mosaicml/mpt-7b-instruct', # internal code handles instruct
  'mosaicml/mpt-7b-chat', # NC, internal code handles instruct
  'gptj', # internally handles prompting
- 'llama', # internally handles prompting
+ 'llama', # plain, or need to choose prompt_type for given TheBloke model
+ 'gpt4all_llama', # internally handles prompting
  ],
  'prompt_answer': [
  'h2oai/h2ogpt-gm-oasst1-en-1024-20b',
@@ -47,6 +55,7 @@ prompt_type_to_model_name = {
  'h2oai/h2ogpt-gm-oasst1-multilang-1024-20b',
  'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt',
  'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2',
+ 'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-700bt',
  ],
  'instruct': [],
  'instruct_with_end': ['databricks/dolly-v2-12b'],
@@ -58,7 +67,9 @@ prompt_type_to_model_name = {
  'h2oai/h2ogpt-oig-oasst1-512-6_9b',
  'h2oai/h2ogpt-oig-oasst1-256-6.9b', # legacy
  'h2oai/h2ogpt-oig-oasst1-512-6.9b', # legacy
- 'h2oai/h2ogpt-research-oasst1-512-30b', # private
+ 'h2oai/h2ogpt-research-oasst1-512-30b',
+ 'h2oai/h2ogpt-oasst1-falcon-40b',
+ 'h2oai/h2ogpt-oig-oasst1-falcon-40b',
  ],
  'dai_faq': [],
  'summarize': [],
@@ -83,7 +94,8 @@ for p in PromptType:
 
 
  def get_prompt(prompt_type, chat, context, reduced):
- if prompt_type in [-1, "-1", "plain"]:
+ if prompt_type in [PromptType.plain.value, str(PromptType.plain.value),
+ PromptType.plain.name]:
  promptA = promptB = PreInstruct = PreInput = PreResponse = ''
  terminate_response = []
  chat_sep = ''
@@ -95,11 +107,14 @@ def get_prompt(prompt_type, chat, context, reduced):
  chat_sep = '\n'
  humanstr = ''
  botstr = ''
- elif prompt_type in [0, "0", "instruct"] or prompt_type in [7, "7", "instruct_with_end"]:
+ elif prompt_type in [PromptType.instruct.value, str(PromptType.instruct.value),
+ PromptType.instruct.name] + [PromptType.instruct_with_end.value,
+ str(PromptType.instruct_with_end.value),
+ PromptType.instruct_with_end.name]:
  promptA = 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n' if not (
- chat and reduced) else ''
+ chat and reduced) else ''
  promptB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n' if not (
- chat and reduced) else ''
+ chat and reduced) else ''
 
  PreInstruct = """
  ### Instruction:
@@ -112,18 +127,20 @@ def get_prompt(prompt_type, chat, context, reduced):
  PreResponse = """
  ### Response:
  """
- if prompt_type in [7, "7", "instruct_with_end"]:
+ if prompt_type in [PromptType.instruct_with_end.value, str(PromptType.instruct_with_end.value),
+ PromptType.instruct_with_end.name]:
  terminate_response = ['### End']
  else:
  terminate_response = None
  chat_sep = '\n'
  humanstr = PreInstruct
  botstr = PreResponse
- elif prompt_type in [1, "1", "quality"]:
+ elif prompt_type in [PromptType.quality.value, str(PromptType.quality.value),
+ PromptType.quality.name]:
  promptA = 'Write a detailed high-quality, accurate, fair, Response with about 100 words by following the Instruction as applied on the Input.\n' if not (
- chat and reduced) else ''
+ chat and reduced) else ''
  promptB = 'Write a detailed high-quality, accurate, fair, Response with about 100 words by following the Instruction.\n' if not (
- chat and reduced) else ''
+ chat and reduced) else ''
 
  PreInstruct = """
  ### Instruction:
@@ -140,10 +157,14 @@ def get_prompt(prompt_type, chat, context, reduced):
  chat_sep = '\n'
  humanstr = PreInstruct # first thing human says
  botstr = PreResponse # first thing bot says
- elif prompt_type in [2, "2", "human_bot", 9, "9", "human_bot_orig"]:
+ elif prompt_type in [PromptType.human_bot.value, str(PromptType.human_bot.value),
+ PromptType.human_bot.name] + [PromptType.human_bot_orig.value,
+ str(PromptType.human_bot_orig.value),
+ PromptType.human_bot_orig.name]:
  human = '<human>:'
  bot = "<bot>:"
- if reduced or context or prompt_type in [2, "2", "human_bot"]:
+ if reduced or context or prompt_type in [PromptType.human_bot.value, str(PromptType.human_bot.value),
+ PromptType.human_bot.name]:
  preprompt = ''
  else:
  cur_date = time.strftime('%Y-%m-%d')
@@ -174,7 +195,8 @@ Current Time: {}
  chat_sep = '\n'
  humanstr = human # tag before human talks
  botstr = bot # tag before bot talks
- elif prompt_type in [3, "3", "dai_faq"]:
+ elif prompt_type in [PromptType.dai_faq.value, str(PromptType.dai_faq.value),
+ PromptType.dai_faq.name]:
  promptA = ''
  promptB = 'Answer the following Driverless AI question.\n'
 
@@ -191,7 +213,8 @@ Current Time: {}
  chat_sep = terminate_response
  humanstr = PreInstruct
  botstr = PreResponse
- elif prompt_type in [5, "5", "summarize"]:
+ elif prompt_type in [PromptType.summarize.value, str(PromptType.summarize.value),
+ PromptType.summarize.name]:
  promptA = promptB = PreInput = ''
  PreInstruct = '## Main Text\n\n'
  PreResponse = '\n\n## Summary\n\n'
@@ -199,10 +222,11 @@ Current Time: {}
  chat_sep = '\n'
  humanstr = PreInstruct
  botstr = PreResponse
- elif prompt_type in [6, "6", "instruct_vicuna"]:
+ elif prompt_type in [PromptType.instruct_vicuna.value, str(PromptType.instruct_vicuna.value),
+ PromptType.instruct_vicuna.name]:
  promptA = promptB = "A chat between a curious human and an artificial intelligence assistant. " \
  "The assistant gives helpful, detailed, and polite answers to the human's questions." if not (
- chat and reduced) else ''
+ chat and reduced) else ''
 
  PreInstruct = """
  ### Human:
@@ -218,7 +242,8 @@ Current Time: {}
  chat_sep = '\n'
  humanstr = PreInstruct
  botstr = PreResponse
- elif prompt_type in [10, "10", "prompt_answer"]:
+ elif prompt_type in [PromptType.prompt_answer.value, str(PromptType.prompt_answer.value),
+ PromptType.prompt_answer.name]:
  preprompt = ''
  prompt_tokens = "<|prompt|>"
  answer_tokens = "<|answer|>"
@@ -232,7 +257,8 @@ Current Time: {}
  chat_sep = eos
  humanstr = prompt_tokens
  botstr = answer_tokens
- elif prompt_type in [11, "11", "open_assistant"]:
+ elif prompt_type in [PromptType.open_assistant.value, str(PromptType.open_assistant.value),
+ PromptType.open_assistant.name]:
  # From added_tokens.json
  preprompt = ''
  prompt_tokens = "<|prompter|>"
@@ -248,20 +274,22 @@ Current Time: {}
  chat_sep = eos
  humanstr = prompt_tokens
  botstr = answer_tokens
- elif prompt_type in [12, "12", "wizard_lm"]:
+ elif prompt_type in [PromptType.wizard_lm.value, str(PromptType.wizard_lm.value),
+ PromptType.wizard_lm.name]:
  # https://github.com/ehartford/WizardLM/blob/main/src/train_freeform.py
  preprompt = ''
  start = ''
  promptB = promptA = '%s%s' % (preprompt, start)
  PreInstruct = ""
  PreInput = None
- PreResponse = "\n\n### Response"
+ PreResponse = "\n\n### Response\n"
  eos = "</s>"
  terminate_response = [PreResponse, eos]
  chat_sep = eos
  humanstr = promptA
  botstr = PreResponse
- elif prompt_type in [13, "13", "wizard_mega"]:
+ elif prompt_type in [PromptType.wizard_mega.value, str(PromptType.wizard_mega.value),
+ PromptType.wizard_mega.name]:
  preprompt = ''
  start = ''
  promptB = promptA = '%s%s' % (preprompt, start)
@@ -276,6 +304,75 @@ Current Time: {}
  chat_sep = '\n'
  humanstr = PreInstruct
  botstr = PreResponse
+ elif prompt_type in [PromptType.instruct_vicuna2.value, str(PromptType.instruct_vicuna2.value),
+ PromptType.instruct_vicuna2.name]:
+ promptA = promptB = "" if not (
+ chat and reduced) else ''
+
+ PreInstruct = """
+ HUMAN:
+ """
+
+ PreInput = None
+
+ PreResponse = """
+ ASSISTANT:
+ """
+ terminate_response = [
+ 'HUMAN:'] # but only allow terminate after prompt is found correctly, else can't terminate
+ chat_sep = '\n'
+ humanstr = PreInstruct
+ botstr = PreResponse
+ elif prompt_type in [PromptType.instruct_vicuna3.value, str(PromptType.instruct_vicuna3.value),
+ PromptType.instruct_vicuna3.name]:
+ promptA = promptB = "" if not (
+ chat and reduced) else ''
+
+ PreInstruct = """
+ ### User:
+ """
+
+ PreInput = None
+
+ PreResponse = """
+ ### Assistant:
+ """
+ terminate_response = [
+ '### User:'] # but only allow terminate after prompt is found correctly, else can't terminate
+ chat_sep = '\n'
+ humanstr = PreInstruct
+ botstr = PreResponse
+ elif prompt_type in [PromptType.wizard2.value, str(PromptType.wizard2.value),
+ PromptType.wizard2.name]:
+ # https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGML
+ preprompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request."""
+ start = ''
+ promptB = promptA = '%s%s' % (preprompt, start)
+ PreInstruct = """
+ ### Instruction:
+ """
+ PreInput = None
+ PreResponse = """
+ ### Response:
+ """
+ terminate_response = [PreResponse]
+ chat_sep = '\n'
+ humanstr = PreInstruct
+ botstr = PreResponse
+ elif prompt_type in [PromptType.wizard3.value, str(PromptType.wizard3.value),
+ PromptType.wizard3.name]:
+ # https://huggingface.co/TheBloke/wizardLM-13B-1.0-GGML
+ preprompt = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."""
+ start = ''
+ promptB = promptA = '%s%s' % (preprompt, start)
+ PreInstruct = """USER: """
+ PreInput = None
+ PreResponse = """ASSISTANT: """
+ terminate_response = [PreResponse]
+ chat_sep = '\n'
+ humanstr = PreInstruct
+ botstr = PreResponse
+
  else:
  raise RuntimeError("No such prompt_type=%s" % prompt_type)
 
@@ -412,7 +509,7 @@ class Prompter(object):
  multi_output = len(outputs) > 1
 
  for oi, output in enumerate(outputs):
- if self.prompt_type in [0, '0', 'plain']:
+ if self.prompt_type in [PromptType.plain.value, str(PromptType.plain.value), PromptType.plain.name]:
  output = clean_response(output)
  elif prompt is None:
  # then use most basic parsing like pipeline
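To illustrate the new enum-based dispatch, a small sketch (the rendered text in the comment is approximate; exact whitespace depends on the templates in get_prompt):

# Hypothetical example using the Prompter class defined in this file.
from prompter import Prompter, PromptType

prompter = Prompter(PromptType.wizard3.name, debug=False, chat=False, stream_output=False)
data_point = dict(context='', instruction="Why is the sky blue?", input='')
print(prompter.generate_prompt(data_point))
# Roughly: the wizard3 preprompt, then "USER: Why is the sky blue?", then "ASSISTANT: "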
requirements.txt CHANGED
@@ -1,7 +1,6 @@
  # for generate (gradio server) and finetune
  datasets==2.12.0
  sentencepiece==0.1.97
- accelerate==0.18.0
  gradio==3.31.0
  huggingface_hub==0.14.1
  appdirs==1.4.4
@@ -18,8 +17,9 @@ numpy==1.24.2
  pandas==2.0.0
  matplotlib==3.7.1
  loralib==0.1.1
- bitsandbytes==0.38.1
- git+https://github.com/huggingface/peft.git@098962fa6515f2e4fe83a757f5995d3ffbb1c373
+ bitsandbytes==0.39.0
+ accelerate==0.19.0
+ git+https://github.com/huggingface/peft.git@3714aa2fff158fdfa637b2b65952580801d890b2
  transformers==4.28.1
  tokenizers==0.13.3
  APScheduler==3.10.1
@@ -50,18 +50,15 @@ pypandoc_binary==1.11
  openpyxl==3.1.2
  lm_dataformat==0.0.20
  bioc==2.0
- # To install with constraints
- # grep -v '#\|peft' requirements.txt > req_constraints.txt ; pip install -r requirements_optional_langchain.txt -c req_constraints.txt
 
+ # falcon
+ einops==0.6.1
  # optional for chat with PDF
- langchain==0.0.178
+ langchain==0.0.183
  pypdf==3.8.1
  tiktoken==0.3.3
  # avoid textract, requires old six
  #textract==1.6.5
- # choose:
- #faiss-cpu
- faiss-gpu==1.7.2
 
  # for HF embeddings
  sentence_transformers==2.2.2
@@ -69,7 +66,7 @@ sentence_transformers==2.2.2
  openai==0.27.6
 
  # local vector db
- chromadb==0.3.23
+ chromadb==0.3.25
  # server vector db
  #pymilvus==2.2.8
 
@@ -92,8 +89,12 @@ requests_file==1.5.1
  tabulate==0.9.0
  # FYI pandoc already part of requirements.txt
 
- jq==1.4.1
+ # JSONLoader, but makes some trouble for some users
+ # jq==1.4.1
 
  # to check licenses
  # Run: pip-licenses|grep -v 'BSD\|Apache\|MIT'
  pip-licenses==4.3.0
+
+ # weaviate vector db
+ weaviate-client==3.19.2
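An optional sanity check (not part of the commit) that an environment actually picked up the bumped pins above:

# Hypothetical helper: compare installed versions against the new pins.
from importlib.metadata import version

pins = {
    "bitsandbytes": "0.39.0",
    "accelerate": "0.19.0",
    "langchain": "0.0.183",
    "chromadb": "0.3.25",
    "einops": "0.6.1",
    "weaviate-client": "3.19.2",
}
for pkg, expected in pins.items():
    print(pkg, "installed:", version(pkg), "pinned:", expected)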
stopping.py CHANGED
@@ -1,6 +1,8 @@
  import torch
  from transformers import StoppingCriteria, StoppingCriteriaList
 
+ from prompter import PromptType
+
 
  class StoppingCriteriaSub(StoppingCriteria):
 
@@ -24,14 +26,14 @@ class StoppingCriteriaSub(StoppingCriteria):
 
 
  def get_stopping(prompt_type, tokenizer, device, human='<human>:', bot="<bot>:"):
- if prompt_type in ['human_bot', 'instruct_vicuna', 'instruct_with_end']:
- if prompt_type == 'human_bot':
+ if prompt_type in [PromptType.human_bot.name, PromptType.instruct_vicuna.name, PromptType.instruct_with_end.name]:
+ if prompt_type == PromptType.human_bot.name:
  # encounters = [prompt.count(human) + 1, prompt.count(bot) + 1]
  # stopping only starts once output is beyond prompt
  # 1 human is enough to trigger, but need 2 bots, because very first view back will be bot we added
  stop_words = [human, bot, '\n' + human, '\n' + bot]
  encounters = [1, 2]
- elif prompt_type == 'instruct_vicuna':
+ elif prompt_type == PromptType.instruct_vicuna.name:
  # even below is not enough, generic strings and many ways to encode
  stop_words = [
  '### Human:',
@@ -58,7 +60,7 @@ def get_stopping(prompt_type, tokenizer, device, human='<human>:', bot="<bot>:")
  stop_words_ids = [x if len(x.shape) > 0 else torch.tensor([x]) for x in stop_words_ids]
  stop_words_ids = [x for x in stop_words_ids if x.shape[0] > 0]
  # avoid padding in front of tokens
- if tokenizer.pad_token:
+ if tokenizer._pad_token: # use hidden variable to avoid annoying properly logger bug
  stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
  # handle fake \n added
  stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
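A standalone sketch of how get_stopping is consumed; in the repo this wiring lives inside H2OTextGenerationPipeline._forward shown earlier, and the model choice here is illustrative:

# Hypothetical example: stop generation at the next <human>:/<bot>: turn marker.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from prompter import PromptType
from stopping import get_stopping

model_name = "h2oai/h2ogpt-oig-oasst1-512-6_9b"  # illustrative choice
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

stopping_criteria = get_stopping(PromptType.human_bot.name, tokenizer, device,
                                 human='<human>:', bot='<bot>:')
inputs = tokenizer("<human>: Why is the sky blue?\n<bot>:", return_tensors="pt").to(device)
out = model.generate(**inputs, max_new_tokens=64, stopping_criteria=stopping_criteria)
print(tokenizer.decode(out[0], skip_special_tokens=True))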
utils.py CHANGED
@@ -1,6 +1,7 @@
  import contextlib
  import functools
  import hashlib
+ import inspect
  import os
  import gc
  import pathlib
@@ -16,6 +17,8 @@ from datetime import datetime
  import filelock
  import requests, uuid
  from typing import Tuple, Callable, Dict
+ from tqdm.auto import tqdm
+ from joblib import Parallel
  from concurrent.futures import ProcessPoolExecutor
  import numpy as np
  import pandas as pd
@@ -371,18 +374,15 @@ def sanitize_filename(name):
  return name
 
 
- def shutil_rmtree_simple(*args, **kwargs):
- path = args[0]
- assert not os.path.samefile(path, "./tmp"), "Should not be trying to remove entire data directory: %s" % str(path)
- # print("Removing path %s" % args[0]) # for debugging
+ def shutil_rmtree(*args, **kwargs):
  return shutil.rmtree(*args, **kwargs)
 
 
- def remove_simple(path: str):
+ def remove(path: str):
  try:
  if path is not None and os.path.exists(path):
  if os.path.isdir(path):
- shutil_rmtree_simple(path, ignore_errors=True)
+ shutil_rmtree(path, ignore_errors=True)
  else:
  with contextlib.suppress(FileNotFoundError):
  os.remove(path)
@@ -408,7 +408,7 @@ def atomic_move_simple(src, dst):
  shutil.move(src, dst)
  except (shutil.Error, FileExistsError):
  pass
- remove_simple(src)
+ remove(src)
 
 
  def download_simple(url, dest=None, print_func=None):
@@ -481,7 +481,7 @@ def download(url, dest=None, dest_path=None):
  shutil.move(dest_tmp, dest)
  except FileExistsError:
  pass
- remove_simple(dest_tmp)
+ remove(dest_tmp)
  return dest
 
 
@@ -766,3 +766,78 @@ def call_subprocess_onetask(func, args=None, kwargs=None):
  with ProcessPoolExecutor(max_workers=1) as executor:
  future = executor.submit(_traced_func, *args, **kwargs)
  return future.result()
+
+
+ class ProgressParallel(Parallel):
+ def __init__(self, use_tqdm=True, total=None, *args, **kwargs):
+ self._use_tqdm = use_tqdm
+ self._total = total
+ super().__init__(*args, **kwargs)
+
+ def __call__(self, *args, **kwargs):
+ with tqdm(disable=not self._use_tqdm, total=self._total) as self._pbar:
+ return Parallel.__call__(self, *args, **kwargs)
+
+ def print_progress(self):
+ if self._total is None:
+ self._pbar.total = self.n_dispatched_tasks
+ self._pbar.n = self.n_completed_tasks
+ self._pbar.refresh()
+
+
+ def get_kwargs(func, exclude_names=None, **kwargs):
+ func_names = list(inspect.signature(func).parameters)
+ missing_kwargs = [x for x in func_names if x not in kwargs]
+ if exclude_names:
+ for k in exclude_names:
+ if k in missing_kwargs:
+ missing_kwargs.remove(k)
+ if k in func_names:
+ func_names.remove(k)
+ assert not missing_kwargs, "Missing %s" % missing_kwargs
+ kwargs = {k: v for k, v in kwargs.items() if k in func_names}
+ return kwargs
+
+
+ import pkg_resources
+ have_faiss = False
+
+ try:
+ assert pkg_resources.get_distribution('faiss') is not None
+ have_faiss = True
+ except (pkg_resources.DistributionNotFound, AssertionError):
+ pass
+ try:
+ assert pkg_resources.get_distribution('faiss_gpu') is not None
+ have_faiss = True
+ except (pkg_resources.DistributionNotFound, AssertionError):
+ pass
+ try:
+ assert pkg_resources.get_distribution('faiss_cpu') is not None
+ have_faiss = True
+ except (pkg_resources.DistributionNotFound, AssertionError):
+ pass
+
+
+ def hash_file(file):
+ try:
+ import hashlib
+
+ # BUF_SIZE is totally arbitrary, change for your app!
+ BUF_SIZE = 65536 # lets read stuff in 64kb chunks!
+
+ md5 = hashlib.md5()
+ #sha1 = hashlib.sha1()
+
+ with open(file, 'rb') as f:
+ while True:
+ data = f.read(BUF_SIZE)
+ if not data:
+ break
+ md5.update(data)
+ #sha1.update(data)
+ except BaseException as e:
+ print("Cannot hash %s due to %s" % (file, str(e)))
+ traceback.print_exc()
+ md5 = None
+ return md5.hexdigest()
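A short sketch exercising the new helpers (the worker and target functions are invented for illustration):

# Hypothetical example for ProgressParallel, get_kwargs, and hash_file.
from joblib import delayed
from utils import ProgressParallel, get_kwargs, hash_file

def square(x):
    return x * x

# joblib.Parallel with a tqdm progress bar
results = ProgressParallel(n_jobs=2, total=10)(delayed(square)(i) for i in range(10))

def some_func(a, b=1):
    return a + b

# keep only kwargs that some_func accepts; 'b' is excluded from the "must be provided" check
kwargs = get_kwargs(some_func, exclude_names=['b'], a=3, c="ignored")

print(results, some_func(**kwargs), hash_file(__file__))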