import torch
import os
import gradio as gr
from auto_gptq import AutoGPTQForCausalLM
# from ctransformers import AutoModelForCausalLM, AutoConfig, Config
from transformers import AutoTokenizer, pipeline, GenerationConfig
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.retrievers import MultiQueryRetriever
# from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.memory import ConversationBufferWindowMemory
from langchain_community.llms import llamacpp, huggingface_pipeline
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from huggingface_hub import hf_hub_download
from dotenv import load_dotenv
# import os
# os.getenv('hf_token')

# MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"

# Prompt used to condense the chat history and a follow-up question into a standalone question.
_template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question, without changing its content.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

# System prompt prepended to the QA prompt of the combine-documents ("stuff") chain.
system_prompt = """You are a helpful assistant; use the provided context to answer user questions.
Read the given context before answering and think step by step. If you cannot answer a question based on the provided context, inform the user. Do not use any other information to answer. Provide a detailed answer to the question."""

load_dotenv()


def load_quantized_model_gptq(model_id, model_basename):
    """Load a GPTQ-quantized model with AutoGPTQ and wrap it in a LangChain HuggingFacePipeline."""
    # if ".safetensors" in model_basename:
    #     model_basename = model_basename.replace(".safetensors", "")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, cache_dir=r"E:\AW\LLMs\models")
    model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        # model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        cache_dir=r"E:\AW\LLMs\models",
    )
    generation_config = GenerationConfig.from_pretrained(model_id)
    pipe = pipeline(
        "text-generation",
        model=model,  # type: ignore
        tokenizer=tokenizer,
        max_length=20000,
        temperature=0.7,
        # top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )
    local_llm = huggingface_pipeline.HuggingFacePipeline(pipeline=pipe)
    return local_llm


def load_quantized_model(model_id=None):
    """Download a GGUF-quantized model from the Hugging Face Hub and load it with LlamaCpp."""
    MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"
    # if model_id == "Zephyr-7b-Beta":
    #     MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"
    # elif model_id == "Llama-2-7b-chat":
    #     MODEL_ID, MODEL_BASENAME = "TheBloke/Llama-2-7b-Chat-GGUF", "llama-2-7b-chat.Q4_K_M.gguf"
    try:
        # logging.info("Using LlamaCPP for GGUF quantized model")
        model_path = hf_hub_download(
            repo_id=MODEL_ID,
            filename=MODEL_BASENAME,
            resume_download=True,
            cache_dir=r"E:\AW\LLMs\models",
        )
        kwargs = {
            "model_path": model_path,
            "n_ctx": 10000,
            "max_tokens": 10000,
            "n_batch": 512,
            # "n_gpu_layers": 6,
        }  # offloading 5 layers to the GPU gave an answer in 6-7 minutes; 3270 MB of VRAM
        return llamacpp.LlamaCpp(**kwargs)
    except TypeError:
        print("Supported model architectures: Llama, Mistral")
        return None


def upload_files(files):
    """Return the local paths of files uploaded through the (currently disabled) upload button."""
    file_paths = [file.name for file in files]
    return file_paths
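

# --- Illustrative sketch, not called anywhere in this app ---------------------------------
# The Chroma store below is opened from persist_directory="db", but this script never builds
# that index. This helper shows one way the directory could be populated with the same BGE
# embeddings. The loader choice, chunk sizes, and source_dir default are assumptions for
# illustration only, not part of the original application.
def build_vectorstore(source_dir="docs", persist_directory="db"):
    """Sketch: index a folder of PDFs into the Chroma directory this app reads from."""
    # Local imports: only needed if you choose to run this optional helper.
    from langchain_community.document_loaders import PyPDFDirectoryLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    documents = PyPDFDirectoryLoader(source_dir).load()
    chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(documents)
    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )
    # With persist_directory set, Chroma writes the index to disk (older Chroma versions
    # may additionally require calling .persist() on the returned store).
    return Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)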


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # PrivateGPT
        """)
    with gr.Row():
        with gr.Column(scale=2):  # type: ignore
            # with gr.Column(scale=5):
            #     with gr.Row():
            #         file_output = gr.File(label="Uploaded Documents", show_label=True)
            #     with gr.Row():
            #         upload_button = gr.UploadButton(
            #             "Click to upload files",
            #             file_types=[".pdf", ".csv", ".xlsx", ".txt"],
            #             file_count="multiple",
            #         )
            #         upload_button.upload(upload_files, upload_button, file_output)
            with gr.Row():
                # NOTE: this radio is not yet wired to the model loading below;
                # load_quantized_model() currently ignores its argument and always
                # loads the Zephyr GGUF model.
                model_id = gr.Radio(
                    ["Zephyr-7b-Beta", "Llama-2-7b-chat"],
                    value="Llama-2-7b-chat",
                    label="LLM Model",
                )
                # Temp = gr.Slider(minimum=0, maximum=5, step=0.1, info="Adjust the [random parameter] of LLM from here")
            with gr.Row():
                # NOTE: the QA mode is likewise not yet used by the chain below.
                mode = gr.Radio(["Document", "Data"], value="Document", label="QA mode")
            # print(f"selected {model} model with {Temp} temperature")

            # Embedding model and the persisted Chroma vector store. The "db" directory must
            # already contain an indexed collection built with the same embedding model
            # (one way to build it is sketched in build_vectorstore above).
            persist_directory = "db"
            embeddings = HuggingFaceBgeEmbeddings(
                model_name="BAAI/bge-small-en-v1.5",
                model_kwargs={"device": "cpu"},
                encode_kwargs={"normalize_embeddings": True},
                cache_folder=r"E:\AW\LLMs\models",
            )
            db2 = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

            # llm = load_quantized_model(model_id=model_id)  # type: ignore
            MODEL_ID = "TheBloke/Llama-2-7B-Chat-GPTQ"
            # MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
            MODEL_BASENAME = "gptq-4bit-32g-actorder_True"
            # -----------------------------------------------------------------------------
            # llm = load_quantized_model_gptq(model_id=MODEL_ID, model_basename=MODEL_BASENAME)
            llm = load_quantized_model()
            # -----------------------------------------------------------------------------

            # Prompts: one to condense the follow-up question, one to answer from retrieved context.
            condense_question_prompt_template = PromptTemplate.from_template(_template)
            prompt_template = system_prompt + """

{context}

Question: {question}
Helpful Answer:"""
            qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

            # Keep only the last exchange in memory (k=1).
            memory = ConversationBufferWindowMemory(memory_key="chat_history", k=1, return_messages=True)
            # memory = ConversationKGMemory(llm=llm, memory_key='chat_history', return_messages=True)
            # compressor = LLMChainExtractor.from_llm(llm=llm)
            # compression_retriever = ContextualCompressionRetriever(
            #     base_compressor=compressor,
            #     base_retriever=db2.as_retriever(search_kwargs={'k': 5}),
            # )

            # MultiQueryRetriever asks the LLM for several rewordings of the user question
            # and merges the documents retrieved for each of them.
            retriever_from_llm = MultiQueryRetriever.from_llm(
                retriever=db2.as_retriever(search_kwargs={"k": 5}),
                llm=llm,
                # llm=load_quantized_model(model_id="TheBloke/Llama-2-7B-Chat-GPTQ"),
            )

            qa2 = ConversationalRetrievalChain(
                # retriever=db.as_retriever(),
                retriever=retriever_from_llm,
                question_generator=LLMChain(
                    llm=llm, prompt=condense_question_prompt_template, memory=memory, verbose=True
                ),  # type: ignore
                combine_docs_chain=load_qa_chain(
                    llm=llm, chain_type="stuff", prompt=qa_prompt, verbose=True
                ),  # type: ignore
                memory=memory,
                verbose=True,  # type: ignore
            )

            def add_text(history, text):
                """Append the user's message to the chat history and clear the textbox."""
                history = history + [(text, None)]
                return history, ""

            def bot(history):
                """Run the conversational retrieval chain on the latest user message."""
                res = qa2.invoke(
                    {
                        "question": history[-1][0],
                        "chat_history": history[:-1],
                    }
                )
                history[-1][1] = res["answer"]
                torch.cuda.empty_cache()
                return history

        with gr.Column(scale=8):  # type: ignore
            with gr.Row():
                chatbot = gr.Chatbot(
                    [],
                    elem_id="chatbot",
                    label="Chat",
                    height=500,
                    show_label=True,
                    avatar_images=["user.jpeg", "Bot.jpg"],
                )
            with gr.Row():
                with gr.Column(scale=8):  # type: ignore
                    txt = gr.Textbox(
                        show_label=False,
                        placeholder="Enter text and press enter",
                        container=False,
                    )
                with gr.Column(scale=1):  # type: ignore
                    submit_btn = gr.Button("Submit", variant="primary")
                with gr.Column(scale=1):  # type: ignore
                    clear_btn = gr.Button("Clear", variant="stop")
        txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
            bot, chatbot, chatbot
        )
        submit_btn.click(add_text, [chatbot, txt], [chatbot, txt]).then(
            bot, chatbot, chatbot
        )
        clear_btn.click(lambda: None, None, chatbot, queue=False)


if __name__ == "__main__":
    demo.queue()
    # demo.launch(share=True)
    demo.launch(max_threads=40)
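
# Illustrative, commented-out smoke test (assumes the "db" Chroma store is populated and the
# GGUF model is available). It mirrors what bot() does, just without the Gradio UI; the example
# question is only a placeholder:
#
# result = qa2.invoke({"question": "Summarise the indexed documents.", "chat_history": []})
# print(result["answer"])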