whoami02 committed
Commit c59f483
1 Parent(s): 0df619a

Upload 3 files

Files changed (4)
  1. .gitattributes +1 -0
  2. Bot.jpg +3 -0
  3. bot.py +202 -0
  4. user.jpeg +0 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ Bot.jpg filter=lfs diff=lfs merge=lfs -text
Bot.jpg ADDED

Git LFS Details

  • SHA256: 69f66ef4e5dfa42ee35ae4397cc630ef3e6fc749ff5efa4e3ff9f9486e938b02
  • Pointer size: 132 Bytes
  • Size of remote file: 1.05 MB
bot.py ADDED
@@ -0,0 +1,202 @@
+ import torch
+ import os
+ import gradio as gr
+ from auto_gptq import AutoGPTQForCausalLM
+ # from ctransformers import AutoModelForCausalLM, AutoConfig, Config
+ from transformers import AutoTokenizer, pipeline, GenerationConfig
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from langchain.retrievers import MultiQueryRetriever
+ # from langchain.retrievers.document_compressors import LLMChainExtractor
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.memory import ConversationBufferWindowMemory
+ from langchain_community.llms import llamacpp, huggingface_pipeline
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+ from langchain.chains.question_answering import load_qa_chain
+ from huggingface_hub import hf_hub_download
+ from dotenv import load_dotenv
+ # import os
+ # os.getenv('hf_token')
+ # MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
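+ # Prompt templates: _template condenses the chat history and a follow-up into a
+ # standalone question; system_prompt restricts answers to the retrieved context.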
+ _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a
+ standalone question without changing the content in given question.
+ Chat History:
+ {chat_history}
+ Follow Up Input: {question}
+ Standalone question:"""
+ system_prompt = """You are a helpful assistant, you will use the provided context to answer user questions.
+ Read the given context before answering questions and think step by step. If you can not answer a user question based on the provided context, inform the user.
+ Do not use any other information for answering the user. Provide a detailed answer to the question."""
+
+ load_dotenv()
+
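+ # Load a GPTQ-quantized model with AutoGPTQ and wrap it in a transformers
+ # text-generation pipeline usable as a LangChain LLM (defined but not called;
+ # the GGUF loader below is used instead).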
+ def load_quantized_model_gptq(model_id, model_basename):
+     # if ".safetensors" in model_basename:
+     #     model_basename = model_basename.replace(".safetensors", "")
+     tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, cache_dir = r"E:\AW\LLMs\models")
+     model = AutoGPTQForCausalLM.from_quantized(
+         model_id,
+         # model_basename=model_basename,
+         use_safetensors=True,
+         trust_remote_code=True,
+         device_map="auto",
+         use_triton=False,
+         cache_dir = r"E:\AW\LLMs\models"
+     )
+     generation_config = GenerationConfig.from_pretrained(model_id)
+     pipe = pipeline(
+         "text-generation",
+         model=model, #type: ignore
+         tokenizer=tokenizer,
+         max_length=20000,
+         temperature=0.7,
+         # top_p=0.95,
+         repetition_penalty=1.15,
+         generation_config=generation_config,
+     )
+     local_llm = huggingface_pipeline.HuggingFacePipeline(pipeline=pipe)
+     return local_llm
+
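+ # Download the GGUF build of zephyr-7b-beta from the Hugging Face Hub and load it
+ # with llama.cpp through LangChain's LlamaCpp wrapper (CPU only; GPU offload is
+ # commented out). This is the loader actually used below.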
+ def load_quantized_model(model_id=None):
+     MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
+     # if model_id == "Zephyr-7b-Beta":
+     #     MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
+     # elif model_id == "Llama-2-7b-chat":
+     #     MODEL_ID, MODEL_BASENAME = "TheBloke/Llama-2-7b-Chat-GGUF","llama-2-7b-chat.Q4_K_M.gguf"
+
+     try:
+         # logging.info("Using LlamaCPP for GGUF quantized model")
+         model_path = hf_hub_download(
+             repo_id=MODEL_ID,
+             filename=MODEL_BASENAME,
+             resume_download=True,
+             cache_dir = r"E:\AW\LLMs\models"
+         )
+         kwargs = {
+             'model_path': model_path,
+             'n_ctx': 10000,
+             'max_tokens': 10000,
+             'n_batch': 512,
+             # 'n_gpu_layers':6,
+         }
+         # offloading 5 layers to gpu gave ans in 6-7 mins; 3270 mb of VRAM
+         return llamacpp.LlamaCpp(**kwargs)
+     except TypeError:
+         print("Supported model architecture: Llama, Mistral")
+         return None
+
+ def upload_files(files):
+     file_paths = [file.name for file in files]
+     return file_paths
+
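+ # Gradio UI: the left column holds the model/mode controls and assembles the
+ # retrieval-augmented QA pipeline; the right column holds the chat interface.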
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         <h2> <center> PrivateGPT </center> </h2>
+         """)
+
+     with gr.Row():
+         with gr.Column(scale=2): #type:ignore
+             # with gr.Column(scale=5):
+             #     with gr.Row():
+             #         file_output = gr.File(label="Uploaded Documents",show_label=True)
+             #     with gr.Row():
+             #         upload_button = gr.UploadButton("Click to upload files", file_types=[".pdf", ".csv", ".xlsx", ".txt"], file_count="multiple")
+             #         upload_button.upload(upload_files, upload_button, file_output)
+             with gr.Row():
+                 model_id = gr.Radio(["Zephyr-7b-Beta", "Llama-2-7b-chat"], value="Llama-2-7b-chat",label="LLM Model")
+                 # Temp = gr.Slider(minimum=0, maximum=5, step=0.1, info="Adjust the [random parameter] of LLM from here")
+             with gr.Row():
+                 mode = gr.Radio(['Document', 'Data'], value='Document',label="QA mode")
+             # print(f"selected {model} model with {Temp} temperature")
+             persist_directory = "db"
+             embeddings = HuggingFaceBgeEmbeddings(
+                 model_name = "BAAI/bge-small-en-v1.5",
+                 model_kwargs={"device": "cpu"},
+                 encode_kwargs = {'normalize_embeddings':True},
+                 cache_folder=r"E:\AW\LLMs\models",
+             )
+             db2 = Chroma(persist_directory = persist_directory,embedding_function = embeddings)
+             # llm = load_quantized_model(model_id=model_id) #type:ignore
+             MODEL_ID = "TheBloke/Llama-2-7B-Chat-GPTQ"
+             # MODEL_I = "HuggingFaceH4/zephyr-7b-beta"
+             MODEL_BASENAME = "gptq-4bit-32g-actorder_True"
+             # ---------------------------------------------------------------------------------------------------
+             # llm = load_quantized_model_gptq(model_id=MODEL_ID, model_basename=MODEL_BASENAME)
+             llm = load_quantized_model()
+             # ---------------------------------------------------------------------------------------------------
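+             # Build the condense-question and QA prompts and a sliding-window memory
+             # (k=1 keeps only the most recent exchange) for the chain below.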
+             condense_question_prompt_template = PromptTemplate.from_template(_template)
+             prompt_template = system_prompt + """
+             {context}
+             Question: {question}
+             Helpful Answer:"""
+             qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+             memory = ConversationBufferWindowMemory(memory_key='chat_history', k=1, return_messages=True)
+
+             # memory = ConversationKGMemory(llm=llm, memory_key='chat_history', return_messages=True)
+             # compressor = LLMChainExtractor.from_llm(llm=llm)
+             # compression_retriever = ContextualCompressionRetriever(
+             #     base_compressor=compressor,
+             #     base_retriever=db2.as_retriever(search_kwargs={'k':5})
+             # )
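+             # MultiQueryRetriever uses the LLM to generate several rephrasings of each
+             # question and pulls the top-5 Chroma matches for each; the
+             # ConversationalRetrievalChain condenses follow-ups into standalone questions
+             # and answers them with a "stuff" QA chain over the retrieved documents.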
+             retriever_from_llm = MultiQueryRetriever.from_llm(
+                 retriever=db2.as_retriever(search_kwargs={'k':5}),
+                 llm = llm,
+                 # llm = load_quantized_model(model_id="TheBloke/Llama-2-7B-Chat-GPTQ")
+             )
+             qa2 = ConversationalRetrievalChain(
+                 # retriever=db.as_retriever(),
+                 retriever=retriever_from_llm,
+                 question_generator= LLMChain(llm=llm, prompt=condense_question_prompt_template, memory=memory, verbose=True), #type:ignore
+                 combine_docs_chain=load_qa_chain(llm=llm, chain_type="stuff", prompt=qa_prompt, verbose=True), #type:ignore
+                 memory=memory,
+                 verbose=True,
+                 # type: ignore
+             )
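+             # Chat callbacks: add_text appends the user's message to the history;
+             # bot runs the chain on the latest question plus prior turns, fills in
+             # the answer, then frees any cached GPU memory.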
+             def add_text(history, text):
+                 history = history + [(text, None)]
+                 return history, ""
+
+             def bot(history):
+                 res = qa2.invoke(
+                     {
+                         'question': history[-1][0],
+                         'chat_history': history[:-1]
+                     }
+                 )
+                 history[-1][1] = res['answer']
+                 torch.cuda.empty_cache()
+                 return history
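+         # Right column: chatbot display, input textbox, and Submit/Clear buttons.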
+         with gr.Column(scale=8): # type: ignore
+             with gr.Row():
+                 chatbot = gr.Chatbot([], elem_id="chatbot",label="Chat", height=500, show_label=True, avatar_images=["user.jpeg","Bot.jpg"])
+             with gr.Row():
+                 with gr.Column(scale=8): # type: ignore
+                     txt = gr.Textbox(
+                         show_label=False,
+                         placeholder="Enter text and press enter",
+                         container=False,
+                     )
+                 with gr.Column(scale=1): # type: ignore
+                     submit_btn = gr.Button(
+                         'Submit',
+                         variant='primary'
+                     )
+                 with gr.Column(scale=1): # type: ignore
+                     clear_btn = gr.Button(
+                         'Clear',
+                         variant="stop"
+                     )
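+     # Wire UI events: Enter or Submit appends the user message, then runs the bot;
+     # Clear resets the chat history.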
+     txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
+         bot, chatbot, chatbot
+     )
+     submit_btn.click(add_text, [chatbot, txt], [chatbot, txt]).then(
+         bot, chatbot, chatbot
+     )
+     clear_btn.click(lambda: None, None, chatbot, queue=False)
+
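+ # Enable request queuing and launch the Gradio server when run as a script.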
+ if __name__ == "__main__":
+     demo.queue()
+     # demo.launch(share=True)
+     demo.launch(max_threads=40)
user.jpeg ADDED