# bot_manuals/bot.py
import torch
import os
import gradio as gr
from auto_gptq import AutoGPTQForCausalLM
# from ctransformers import AutoModelForCausalLM, AutoConfig, Config
from transformers import AutoTokenizer, pipeline, GenerationConfig
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.retrievers import MultiQueryRetriever
# from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory
from langchain_community.llms import llamacpp, huggingface_pipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from huggingface_hub import hf_hub_download
from dotenv import load_dotenv
# import os
# os.getenv('hf_token')
# MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a
standalone question without changing the content in given question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
system_prompt = """You are a helpful assistant; you will use the provided context to answer user questions.
Read the given context before answering questions and think step by step. If you cannot answer a user question based on the provided context, inform the user.
Do not use any other information to answer the user. Provide a detailed answer to the question."""
load_dotenv()
def load_quantized_model_gptq(model_id, model_basename):
    """Load a GPTQ-quantized model and wrap it in a LangChain HuggingFacePipeline."""
    # if ".safetensors" in model_basename:
    #     model_basename = model_basename.replace(".safetensors", "")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, cache_dir=r"E:\AW\LLMs\models")
    model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        # model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        cache_dir=r"E:\AW\LLMs\models",
    )
    generation_config = GenerationConfig.from_pretrained(model_id)
    pipe = pipeline(
        "text-generation",
        model=model,  # type: ignore
        tokenizer=tokenizer,
        max_length=20000,
        temperature=0.7,
        # top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )
    local_llm = huggingface_pipeline.HuggingFacePipeline(pipeline=pipe)
    return local_llm
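
# Usage sketch (kept commented out): loading the GPTQ checkpoint referenced further down
# by MODEL_ID / MODEL_BASENAME with this helper. Any other GPTQ repo on the Hub could be
# substituted.
# llm = load_quantized_model_gptq(model_id="TheBloke/Llama-2-7B-Chat-GPTQ",
#                                 model_basename="gptq-4bit-32g-actorder_True")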
def load_quantized_model(model_id=None):
    """Download a GGUF checkpoint from the Hub and load it with LlamaCpp (CPU by default)."""
    MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"
    # if model_id == "Zephyr-7b-Beta":
    #     MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"
    # elif model_id == "Llama-2-7b-chat":
    #     MODEL_ID, MODEL_BASENAME = "TheBloke/Llama-2-7b-Chat-GGUF", "llama-2-7b-chat.Q4_K_M.gguf"
    try:
        # logging.info("Using LlamaCPP for GGUF quantized model")
        model_path = hf_hub_download(
            repo_id=MODEL_ID,
            filename=MODEL_BASENAME,
            resume_download=True,
            cache_dir=r"E:\AW\LLMs\models",
        )
        kwargs = {
            'model_path': model_path,
            'n_ctx': 10000,
            'max_tokens': 10000,
            'n_batch': 512,
            # 'n_gpu_layers': 6,
        }
        # offloading 5 layers to GPU gave an answer in 6-7 mins; 3270 MB of VRAM
        return llamacpp.LlamaCpp(**kwargs)
    except TypeError:
        print("Supported model architectures: Llama, Mistral")
        return None
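
# Note: GPU offload for the GGUF path is controlled by the commented-out 'n_gpu_layers'
# entry in kwargs above; per the timing note, offloading 5 layers used about 3270 MB of
# VRAM. Enabling it assumes a CUDA-enabled build of llama-cpp-python is installed, which
# this file does not configure.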
def upload_files(files):
    file_paths = [file.name for file in files]
    return file_paths
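
# upload_files is only referenced by the commented-out UploadButton wiring inside the
# Blocks layout below; documents are otherwise assumed to be pre-indexed into the
# Chroma "db" directory used as the retriever.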
with gr.Blocks() as demo:
    gr.Markdown(
        """
<h2> <center> PrivateGPT </center> </h2>
""")
    with gr.Row():
        with gr.Column(scale=2):  # type: ignore
            # with gr.Column(scale=5):
            #     with gr.Row():
            #         file_output = gr.File(label="Uploaded Documents", show_label=True)
            #     with gr.Row():
            #         upload_button = gr.UploadButton("Click to upload files", file_types=[".pdf", ".csv", ".xlsx", ".txt"], file_count="multiple")
            #         upload_button.upload(upload_files, upload_button, file_output)
            with gr.Row():
                model_id = gr.Radio(["Zephyr-7b-Beta", "Llama-2-7b-chat"], value="Llama-2-7b-chat", label="LLM Model")
                # Temp = gr.Slider(minimum=0, maximum=5, step=0.1, info="Adjust the [random parameter] of LLM from here")
            with gr.Row():
                mode = gr.Radio(['Document', 'Data'], value='Document', label="QA mode")
            # print(f"selected {model} model with {Temp} temperature")
            persist_directory = "db"
            embeddings = HuggingFaceBgeEmbeddings(
                model_name="BAAI/bge-small-en-v1.5",
                model_kwargs={"device": "cpu"},
                encode_kwargs={'normalize_embeddings': True},
                cache_folder=r"E:\AW\LLMs\models",
            )
            db2 = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
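
            # The "db" directory is assumed to hold a Chroma index built ahead of time; the
            # ingestion step is not part of this file. A minimal sketch of building it with
            # the same embeddings (the loader and splitter choices here are assumptions):
            # from langchain_community.document_loaders import PyPDFLoader
            # from langchain.text_splitter import RecursiveCharacterTextSplitter
            # pages = PyPDFLoader("manual.pdf").load()
            # chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(pages)
            # Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)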
            # llm = load_quantized_model(model_id=model_id)  # type: ignore
            MODEL_ID = "TheBloke/Llama-2-7B-Chat-GPTQ"
            # MODEL_I = "HuggingFaceH4/zephyr-7b-beta"
            MODEL_BASENAME = "gptq-4bit-32g-actorder_True"
            # ---------------------------------------------------------------------------------------------------
            # llm = load_quantized_model_gptq(model_id=MODEL_ID, model_basename=MODEL_BASENAME)
            llm = load_quantized_model()
            # ---------------------------------------------------------------------------------------------------
            condense_question_prompt_template = PromptTemplate.from_template(_template)
            prompt_template = system_prompt + """
{context}
Question: {question}
Helpful Answer:"""
            qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
            memory = ConversationBufferWindowMemory(memory_key='chat_history', k=1, return_messages=True)
            # memory = ConversationKGMemory(llm=llm, memory_key='chat_history', return_messages=True)
            # compressor = LLMChainExtractor.from_llm(llm=llm)
            # compression_retriever = ContextualCompressionRetriever(
            #     base_compressor=compressor,
            #     base_retriever=db2.as_retriever(search_kwargs={'k': 5})
            # )
            retriever_from_llm = MultiQueryRetriever.from_llm(
                retriever=db2.as_retriever(search_kwargs={'k': 5}),
                llm=llm,
                # llm=load_quantized_model(model_id="TheBloke/Llama-2-7B-Chat-GPTQ")
            )
            qa2 = ConversationalRetrievalChain(
                # retriever=db.as_retriever(),
                retriever=retriever_from_llm,
                question_generator=LLMChain(llm=llm, prompt=condense_question_prompt_template, memory=memory, verbose=True),  # type: ignore
                combine_docs_chain=load_qa_chain(llm=llm, chain_type="stuff", prompt=qa_prompt, verbose=True),  # type: ignore
                memory=memory,
                verbose=True,
            )
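
            # Per turn, the question_generator LLMChain condenses the follow-up into a
            # standalone question, MultiQueryRetriever generates query variants and pulls
            # the top-k chunks from Chroma, and the "stuff" QA chain answers from them.
            # Invoking the chain directly (outside Gradio) would look roughly like:
            # result = qa2.invoke({'question': "How do I reset the device?", 'chat_history': []})
            # print(result['answer'])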
            def add_text(history, text):
                """Append the user's message to the chat history and clear the textbox."""
                history = history + [(text, None)]
                return history, ""

            def bot(history):
                """Answer the latest user message with the retrieval chain and write it into the history."""
                res = qa2.invoke(
                    {
                        'question': history[-1][0],
                        'chat_history': history[:-1]
                    }
                )
                history[-1][1] = res['answer']
                torch.cuda.empty_cache()
                return history
        with gr.Column(scale=8):  # type: ignore
            with gr.Row():
                chatbot = gr.Chatbot([], elem_id="chatbot", label="Chat", height=500, show_label=True, avatar_images=["user.jpeg", "Bot.jpg"])
            with gr.Row():
                with gr.Column(scale=8):  # type: ignore
                    txt = gr.Textbox(
                        show_label=False,
                        placeholder="Enter text and press enter",
                        container=False,
                    )
                with gr.Column(scale=1):  # type: ignore
                    submit_btn = gr.Button(
                        'Submit',
                        variant='primary'
                    )
                with gr.Column(scale=1):  # type: ignore
                    clear_btn = gr.Button(
                        'Clear',
                        variant="stop"
                    )
            txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
                bot, chatbot, chatbot
            )
            submit_btn.click(add_text, [chatbot, txt], [chatbot, txt]).then(
                bot, chatbot, chatbot
            )
            clear_btn.click(lambda: None, None, chatbot, queue=False)
if __name__ == "__main__":
    demo.queue()
    # demo.launch(share=True)
    demo.launch(max_threads=40)