File size: 8,098 Bytes
d60948f
1423dd4
7a72052
d60948f
1423dd4
 
d60948f
 
1423dd4
d60948f
 
 
1423dd4
 
 
 
 
 
 
 
d60948f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1423dd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d60948f
1423dd4
 
 
 
 
 
 
 
 
 
 
d60948f
 
 
 
 
 
 
 
 
 
 
 
1423dd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d60948f
 
 
 
 
 
1423dd4
d60948f
 
 
 
 
 
 
 
 
 
1423dd4
d60948f
 
 
 
 
 
7a72052
d60948f
 
 
 
 
 
 
 
 
1423dd4
d60948f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a72052
d60948f
 
 
 
 
 
 
1423dd4
d60948f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
from buster.busterbot import Buster, BusterConfig
from buster.completers import ChatGPTCompleter, DocumentAnswerer
from buster.formatters.documents import DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.llm_utils import get_openai_embedding_constructor
from buster.utils import extract_zip
from buster.retriever import DeepLakeRetriever, Retriever
from buster.tokenizers import GPTTokenizer
from buster.validators import Validator

from huggingface_hub import hf_hub_download

# kwargs to pass to OpenAI client
# Shared by every component that talks to the OpenAI API (completion,
# validation, embeddings) so network behavior is consistent across the app.
client_kwargs = {
    "timeout": 20,  # seconds per request before the client gives up
    "max_retries": 3,  # automatic retries handled by the OpenAI client
}

# Embedding callable used for retrieval and answer validation; the factory
# bakes the client kwargs in so callers just pass text.
embedding_fn = get_openai_embedding_constructor(client_kwargs=client_kwargs)


# Prebuilt DeepLake document store hosted on the Hugging Face Hub.
HUB_DB_FILE = "deeplake_store.zip"
REPO_ID = "jerpint/hf_buster_data"

# Download the zipped store into the current directory at import time.
# NOTE(review): this runs on every import of this module — network access
# and disk writes happen as a module-level side effect.
hf_hub_download(
    repo_id=REPO_ID,
    repo_type="dataset",
    filename=HUB_DB_FILE,
    local_dir=".",
)

# Unpack it next to the script; the retriever below reads "deeplake_store".
extract_zip(zip_file_path=HUB_DB_FILE, output_path=".")


# Full Buster configuration: question/answer/document validation, DeepLake
# retrieval, ChatGPT completion, and prompt formatting for the Hugging Face
# documentation chatbot. (A stale, commented-out validator_cfg that predated
# the split into question/answer/documents sub-configs was removed.)
buster_cfg = BusterConfig(
    validator_cfg={
        # Decides whether an incoming question is in-scope before answering.
        "question_validator_cfg": {
            "invalid_question_response": "This question does not seem relevant to my current knowledge.",
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            },
            "client_kwargs": client_kwargs,
            "check_question_prompt": """You are a chatbot answering technical questions on the Hugging Face documentation, a library used to train and do inference on open-source artificial intelligence models.
A user will submit a question. Your job is only to determine wether or not a question might be related to the library usage or to training AI models.
Questions that are likely to be related to the hugging face library or AI are considered valid.
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.

For example:

Q: How can I train a vision model?
true

Q: What is the meaning of life?
false

A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
        },
        # Flags answers that are semantically close to a canned "I don't
        # know" template (cosine similarity above unknown_threshold).
        "answer_validator_cfg": {
            "unknown_response_templates": [
                "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
            ],
            "unknown_threshold": 0.85,
            "embedding_fn": embedding_fn,
        },
        # Optional per-document relevance check (disabled via
        # validate_documents below).
        "documents_validator_cfg": {
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            },
            "client_kwargs": client_kwargs,
        },
        "use_reranking": True,
        "validate_documents": False,
    },
    # Vector-store lookup over the DeepLake store extracted above.
    retriever_cfg={
        "path": "deeplake_store",
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_model": embedding_fn,
    },
    documents_answerer_cfg={
        "no_documents_message": "No documents are available for this question.",
    },
    # Main answer-generation model; streamed so the UI can render tokens live.
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": True,
            "temperature": 0,
        },
        "client_kwargs": client_kwargs,
    },
    tokenizer_cfg={
        "model_name": "gpt-3.5-turbo",
    },
    # Token budgets for stuffing retrieved documents into the prompt.
    documents_formatter_cfg={
        "max_tokens": 3500,
        "columns": ["content", "source", "title"],
    },
    prompt_formatter_cfg={
        "max_tokens": 3500,
        "text_before_docs": (
            "You are an chatbot answering technical questions on the huggingface transformers library. "
            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
            "If the answer is in the documentation, summarize it in a helpful way to the user. "
            "If it isn't, simply reply that you cannot answer the question. "
            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
            "Here is the documentation:\n"
        ),
        "text_after_docs": (
            "REMEMBER:\n"
            "You are an chatbot answering technical questions on the huggingface transformers library. "
            "Here are the rules you must follow:\n"
            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
            "3) Do not reference any links, urls or hyperlinks in your answers.\n"
            "4) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
            "5) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'"
            "For example:\n"
            "What is the meaning of life for an qa bot?\n"
            "I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with? "
            "Now answer the following question:\n"
        ),
    },
)


def setup_buster(buster_cfg: BusterConfig):
    """Assemble a ready-to-use Buster instance from *buster_cfg*.

    Instantiates the tokenizer, retriever, completer, formatters, answerer
    and validator described by the config and wires them together.
    """
    gpt_tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)

    # Both formatters share one tokenizer so their token budgets agree.
    docs_formatter = DocumentsFormatterJSON(
        tokenizer=gpt_tokenizer, **buster_cfg.documents_formatter_cfg
    )
    prompt_formatter = PromptFormatter(
        tokenizer=gpt_tokenizer, **buster_cfg.prompt_formatter_cfg
    )

    answerer: DocumentAnswerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=docs_formatter,
        prompt_formatter=prompt_formatter,
        **buster_cfg.documents_answerer_cfg,
    )

    doc_retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    answer_validator: Validator = Validator(**buster_cfg.validator_cfg)

    return Buster(
        retriever=doc_retriever,
        document_answerer=answerer,
        validator=answer_validator,
    )