File size: 7,216 Bytes
8cb8054
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
from enum import Enum


class PromptType(Enum):
    custom = -1
    plain = 0
    instruct = 1
    quality = 2
    human_bot = 3
    dai_faq = 4
    summarize = 5
    simple_instruct = 6
    instruct_vicuna = 7
    instruct_with_end = 8
    human_bot_orig = 9
    prompt_answer = 10
    open_assistant = 11
    wizard_lm = 12
    wizard_mega = 13
    instruct_vicuna2 = 14
    instruct_vicuna3 = 15
    wizard2 = 16
    wizard3 = 17
    instruct_simple = 18
    wizard_vicuna = 19
    openai = 20
    openai_chat = 21
    gptj = 22
    prompt_answer_openllama = 23
    vicuna11 = 24
    mptinstruct = 25
    mptchat = 26
    falcon = 27
    guanaco = 28
    llama2 = 29
    beluga = 30
    wizard3nospace = 31
    one_shot = 32
    falcon_chat = 33


class DocumentSubset(Enum):
    Relevant = 0
    RelSources = 1
    TopKSources = 2


non_query_commands = [
    DocumentSubset.RelSources.name,
    DocumentSubset.TopKSources.name
]


class DocumentChoice(Enum):
    ALL = 'All'


class LangChainMode(Enum):
    """LangChain mode"""

    DISABLED = "Disabled"
    LLM = "LLM"
    WIKI = "wiki"
    WIKI_FULL = "wiki_full"
    USER_DATA = "UserData"
    MY_DATA = "MyData"
    GITHUB_H2OGPT = "github h2oGPT"
    H2O_DAI_DOCS = "DriverlessAI docs"


class LangChainTypes(Enum):
    SHARED = 'shared'
    PERSONAL = 'personal'
    EITHER = 'either'  # used when user did not pass which one, so need to try both


# modes should not be removed from visible list or added by name
langchain_modes_intrinsic = [LangChainMode.DISABLED.value,
                             LangChainMode.LLM.value,
                             LangChainMode.MY_DATA.value]

langchain_modes_non_db = [LangChainMode.DISABLED.value,
                          LangChainMode.LLM.value]


class LangChainAction(Enum):
    """LangChain action"""

    QUERY = "Query"
    # WIP:
    # SUMMARIZE_MAP = "Summarize_map_reduce"
    SUMMARIZE_MAP = "Summarize"
    SUMMARIZE_ALL = "Summarize_all"
    SUMMARIZE_REFINE = "Summarize_refine"


class LangChainAgent(Enum):
    """LangChain agents"""

    SEARCH = "Search"
    COLLECTION = "Collection"
    PYTHON = "Python"
    CSV = "CSV"
    PANDAS = "Pandas"
    JSON = 'JSON'


no_server_str = no_lora_str = no_model_str = '[None/Remove]'

# from site-packages/langchain/llms/openai.py
# but needed since ChatOpenAI doesn't have this information
model_token_mapping = {
    "gpt-4": 8192,
    "gpt-4-0314": 8192,
    "gpt-4-32k": 32768,
    "gpt-4-32k-0314": 32768,
    "gpt-3.5-turbo": 4096,
    "gpt-3.5-turbo-16k": 16 * 1024,
    "gpt-3.5-turbo-0301": 4096,
    "text-ada-001": 2049,
    "ada": 2049,
    "text-babbage-001": 2040,
    "babbage": 2049,
    "text-curie-001": 2049,
    "curie": 2049,
    "davinci": 2049,
    "text-davinci-003": 4097,
    "text-davinci-002": 4097,
    "code-davinci-002": 8001,
    "code-davinci-001": 8001,
    "code-cushman-002": 2048,
    "code-cushman-001": 2048,
}

font_size = 2
head_acc = 40  # 40 for 6-way
source_prefix = "Sources [Score | Link]:"
source_postfix = "End Sources<p>"

super_source_prefix = f"""<details><summary><font size="{font_size}">Sources</font></summary><font size="{font_size}"><font size="{font_size}">Sources [Score | Link]:"""
super_source_postfix = f"""End Sources<p></font></font></details>"""


def t5_type(model_name):
    return 't5' == model_name.lower() or \
        't5-' in model_name.lower() or \
        'flan-' in model_name.lower() or \
        'fastchat-t5' in model_name.lower()


def get_langchain_prompts(pre_prompt_query, prompt_query, pre_prompt_summary, prompt_summary,
                          model_name, inference_server, model_path_llama):
    if model_name and ('falcon' in model_name or
                       'Llama-2'.lower() in model_name.lower() or
                       model_path_llama and 'llama-2' in model_path_llama.lower()) or \
            model_name in [None, '']:
        # use when no model, like no --base_model
        pre_prompt_query1 = "Pay attention and remember the information below, which will help to answer the question or imperative after the context ends.\n"
        prompt_query1 = "According to only the information in the document sources provided within the context above, "
    elif inference_server and inference_server.startswith('openai'):
        pre_prompt_query1 = "Pay attention and remember the information below, which will help to answer the question or imperative after the context ends.  If the answer cannot be primarily obtained from information within the context, then respond that the answer does not appear in the context of the documents.\n"
        prompt_query1 = "According to (primarily) the information in the document sources provided within context above, "
    else:
        pre_prompt_query1 = ""
        prompt_query1 = ""

    pre_prompt_summary1 = """In order to write a concise single-paragraph or bulleted list summary, pay attention to the following text\n"""
    prompt_summary1 = "Using only the information in the document sources above, write a condensed and concise summary of key results (preferably as bullet points):\n"

    if pre_prompt_query is None:
        pre_prompt_query = pre_prompt_query1
    if prompt_query is None:
        prompt_query = prompt_query1
    if pre_prompt_summary is None:
        pre_prompt_summary = pre_prompt_summary1
    if prompt_summary is None:
        prompt_summary = prompt_summary1

    return pre_prompt_query, prompt_query, pre_prompt_summary, prompt_summary


def gr_to_lg(image_loaders,
             pdf_loaders,
             url_loaders,
             **kwargs,
             ):
    if image_loaders is None:
        image_loaders = kwargs['image_loaders_options0']
    if pdf_loaders is None:
        pdf_loaders = kwargs['pdf_loaders_options0']
    if url_loaders is None:
        url_loaders = kwargs['url_loaders_options0']
    # translate:
    # 'auto' wouldn't be used here
    ret = dict(
        # urls
        use_unstructured='Unstructured' in url_loaders,
        use_playwright='PlayWright' in url_loaders,
        use_selenium='Selenium' in url_loaders,

        # pdfs
        use_pymupdf='on' if 'PyMuPDF' in pdf_loaders else 'off',
        use_unstructured_pdf='on' if 'Unstructured' in pdf_loaders else 'off',
        use_pypdf='on' if 'PyPDF' in pdf_loaders else 'off',
        enable_pdf_ocr='on' if 'OCR' in pdf_loaders else 'off',
        enable_pdf_doctr='on' if 'DocTR' in pdf_loaders else 'off',
        try_pdf_as_html='on' if 'TryHTML' in pdf_loaders else 'off',

        # images
        enable_ocr='OCR' in image_loaders,
        enable_doctr='DocTR' in image_loaders,
        enable_pix2struct='Pix2Struct' in image_loaders,
        enable_captions='Caption' in image_loaders or 'CaptionBlip2' in image_loaders,
    )
    if 'CaptionBlip2' in image_loaders:
        # just override, don't actually do both even if user chose both
        captions_model = "Salesforce/blip2-flan-t5-xl"
    else:
        captions_model = kwargs['captions_model']
    return ret, captions_model


invalid_key_msg = 'Invalid Access Key, request access key from sales@h2o.ai or jon.mckinney@h2o.ai'

docs_ordering_types = ['best_first', 'best_near_prompt', 'reverse_ucurve_sort']