whoami02 committed
Commit c59f483
1 Parent(s): 0df619a

Upload 3 files

Files changed (4)
  1. .gitattributes +1 -0
  2. Bot.jpg +3 -0
  3. bot.py +202 -0
  4. user.jpeg +0 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ Bot.jpg filter=lfs diff=lfs merge=lfs -text
Bot.jpg ADDED

Git LFS Details

  • SHA256: 69f66ef4e5dfa42ee35ae4397cc630ef3e6fc749ff5efa4e3ff9f9486e938b02
  • Pointer size: 132 Bytes
  • Size of remote file: 1.05 MB
bot.py ADDED
@@ -0,0 +1,202 @@
+ import torch
+ import os
+ import gradio as gr
+ from auto_gptq import AutoGPTQForCausalLM
+ # from ctransformers import AutoModelForCausalLM, AutoConfig, Config
+ from transformers import AutoTokenizer, pipeline, GenerationConfig
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from langchain.retrievers import MultiQueryRetriever
+ # from langchain.retrievers.document_compressors import LLMChainExtractor
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.memory import ConversationBufferWindowMemory
+ from langchain_community.llms import llamacpp, huggingface_pipeline
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+ from langchain.chains.question_answering import load_qa_chain
+ from huggingface_hub import hf_hub_download
+ from dotenv import load_dotenv
+ # import os
+ # os.getenv('hf_token')
+ # MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
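+ # Prompt templates: _template condenses the chat history and a follow-up into a
+ # standalone question; system_prompt restricts answers to the retrieved context.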
+ _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a
+ standalone question without changing the content in given question.
+ Chat History:
+ {chat_history}
+ Follow Up Input: {question}
+ Standalone question:"""
+ system_prompt = """You are a helpful assistant, you will use the provided context to answer user questions.
+ Read the given context before answering questions and think step by step. If you can not answer a user question based on the provided context, inform the user.
+ Do not use any other information for answering the user. Provide a detailed answer to the question."""
+
+ load_dotenv()
+
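+ # Load a GPTQ-quantized model with AutoGPTQ and wrap it in a transformers
+ # text-generation pipeline usable as a LangChain LLM (defined but not called;
+ # the GGUF loader below is used instead).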
+ def load_quantized_model_gptq(model_id, model_basename):
+     # if ".safetensors" in model_basename:
+     #     model_basename = model_basename.replace(".safetensors", "")
+     tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, cache_dir = r"E:\AW\LLMs\models")
+     model = AutoGPTQForCausalLM.from_quantized(
+         model_id,
+         # model_basename=model_basename,
+         use_safetensors=True,
+         trust_remote_code=True,
+         device_map="auto",
+         use_triton=False,
+         cache_dir = r"E:\AW\LLMs\models"
+     )
+     generation_config = GenerationConfig.from_pretrained(model_id)
+     pipe = pipeline(
+         "text-generation",
+         model=model, #type: ignore
+         tokenizer=tokenizer,
+         max_length=20000,
+         temperature=0.7,
+         # top_p=0.95,
+         repetition_penalty=1.15,
+         generation_config=generation_config,
+     )
+     local_llm = huggingface_pipeline.HuggingFacePipeline(pipeline=pipe)
+     return local_llm
+
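+ # Download the GGUF build of zephyr-7b-beta from the Hugging Face Hub and load it
+ # with llama.cpp through LangChain's LlamaCpp wrapper (CPU only; GPU offload is
+ # commented out). This is the loader actually used below.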
+ def load_quantized_model(model_id=None):
+     MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
+     # if model_id == "Zephyr-7b-Beta":
+     #     MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
+     # elif model_id == "Llama-2-7b-chat":
+     #     MODEL_ID, MODEL_BASENAME = "TheBloke/Llama-2-7b-Chat-GGUF","llama-2-7b-chat.Q4_K_M.gguf"
+
+     try:
+         # logging.info("Using LlamaCPP for GGUF quantized model")
+         model_path = hf_hub_download(
+             repo_id=MODEL_ID,
+             filename=MODEL_BASENAME,
+             resume_download=True,
+             cache_dir = r"E:\AW\LLMs\models"
+         )
+         kwargs = {
+             'model_path': model_path,
+             'n_ctx': 10000,
+             'max_tokens': 10000,
+             'n_batch': 512,
+             # 'n_gpu_layers':6,
+         }
+         # offloading 5 layers to gpu gave ans in 6-7 mins; 3270 mb of VRAM
+         return llamacpp.LlamaCpp(**kwargs)
+     except TypeError:
+         print("Supported model architecture: Llama, Mistral")
+         return None
+
+ def upload_files(files):
+     file_paths = [file.name for file in files]
+     return file_paths
+
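+ # Gradio UI: the left column holds the model/mode controls and assembles the
+ # retrieval-augmented QA pipeline; the right column holds the chat interface.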
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         <h2> <center> PrivateGPT </center> </h2>
+         """)
+
+     with gr.Row():
+         with gr.Column(scale=2): #type:ignore
+             # with gr.Column(scale=5):
+             #     with gr.Row():
+             #         file_output = gr.File(label="Uploaded Documents",show_label=True)
+             #     with gr.Row():
+             #         upload_button = gr.UploadButton("Click to upload files", file_types=[".pdf", ".csv", ".xlsx", ".txt"], file_count="multiple")
+             #         upload_button.upload(upload_files, upload_button, file_output)
+             with gr.Row():
+                 model_id = gr.Radio(["Zephyr-7b-Beta", "Llama-2-7b-chat"], value="Llama-2-7b-chat",label="LLM Model")
+                 # Temp = gr.Slider(minimum=0, maximum=5, step=0.1, info="Adjust the [random parameter] of LLM from here")
+             with gr.Row():
+                 mode = gr.Radio(['Document', 'Data'], value='Document',label="QA mode")
+             # print(f"selected {model} model with {Temp} temperature")
+             persist_directory = "db"
+             embeddings = HuggingFaceBgeEmbeddings(
+                 model_name = "BAAI/bge-small-en-v1.5",
+                 model_kwargs={"device": "cpu"},
+                 encode_kwargs = {'normalize_embeddings':True},
+                 cache_folder=r"E:\AW\LLMs\models",
+             )
+             db2 = Chroma(persist_directory = persist_directory,embedding_function = embeddings)
+             # llm = load_quantized_model(model_id=model_id) #type:ignore
+             MODEL_ID = "TheBloke/Llama-2-7B-Chat-GPTQ"
+             # MODEL_I = "HuggingFaceH4/zephyr-7b-beta"
+             MODEL_BASENAME = "gptq-4bit-32g-actorder_True"
+             # ---------------------------------------------------------------------------------------------------
+             # llm = load_quantized_model_gptq(model_id=MODEL_ID, model_basename=MODEL_BASENAME)
+             llm = load_quantized_model()
+             # ---------------------------------------------------------------------------------------------------
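+             # Build the condense-question and QA prompts and a sliding-window memory
+             # (k=1 keeps only the most recent exchange) for the chain below.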
+             condense_question_prompt_template = PromptTemplate.from_template(_template)
+             prompt_template = system_prompt + """
+             {context}
+             Question: {question}
+             Helpful Answer:"""
+             qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+             memory = ConversationBufferWindowMemory(memory_key='chat_history', k=1, return_messages=True)
+
+             # memory = ConversationKGMemory(llm=llm, memory_key='chat_history', return_messages=True)
+             # compressor = LLMChainExtractor.from_llm(llm=llm)
+             # compression_retriever = ContextualCompressionRetriever(
+             #     base_compressor=compressor,
+             #     base_retriever=db2.as_retriever(search_kwargs={'k':5})
+             # )
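+             # MultiQueryRetriever uses the LLM to generate several rephrasings of each
+             # question and pulls the top-5 Chroma matches for each; the
+             # ConversationalRetrievalChain condenses follow-ups into standalone questions
+             # and answers them with a "stuff" QA chain over the retrieved documents.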
+             retriever_from_llm = MultiQueryRetriever.from_llm(
+                 retriever=db2.as_retriever(search_kwargs={'k':5}),
+                 llm = llm,
+                 # llm = load_quantized_model(model_id="TheBloke/Llama-2-7B-Chat-GPTQ")
+             )
+             qa2 = ConversationalRetrievalChain(
+                 # retriever=db.as_retriever(),
+                 retriever=retriever_from_llm,
+                 question_generator= LLMChain(llm=llm, prompt=condense_question_prompt_template, memory=memory, verbose=True), #type:ignore
+                 combine_docs_chain=load_qa_chain(llm=llm, chain_type="stuff", prompt=qa_prompt, verbose=True), #type:ignore
+                 memory=memory,
+                 verbose=True,
+                 # type: ignore
+             )
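+             # Chat callbacks: add_text appends the user's message to the history;
+             # bot runs the chain on the latest question plus prior turns, fills in
+             # the answer, then frees any cached GPU memory.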
+             def add_text(history, text):
+                 history = history + [(text, None)]
+                 return history, ""
+
+             def bot(history):
+                 res = qa2.invoke(
+                     {
+                         'question': history[-1][0],
+                         'chat_history': history[:-1]
+                     }
+                 )
+                 history[-1][1] = res['answer']
+                 torch.cuda.empty_cache()
+                 return history
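+         # Right column: chatbot display, input textbox, and Submit/Clear buttons.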
+         with gr.Column(scale=8): # type: ignore
+             with gr.Row():
+                 chatbot = gr.Chatbot([], elem_id="chatbot",label="Chat", height=500, show_label=True, avatar_images=["user.jpeg","Bot.jpg"])
+             with gr.Row():
+                 with gr.Column(scale=8): # type: ignore
+                     txt = gr.Textbox(
+                         show_label=False,
+                         placeholder="Enter text and press enter",
+                         container=False,
+                     )
+                 with gr.Column(scale=1): # type: ignore
+                     submit_btn = gr.Button(
+                         'Submit',
+                         variant='primary'
+                     )
+                 with gr.Column(scale=1): # type: ignore
+                     clear_btn = gr.Button(
+                         'Clear',
+                         variant="stop"
+                     )
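+     # Wire UI events: Enter or Submit appends the user message, then runs the bot;
+     # Clear resets the chat history.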
+     txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
+         bot, chatbot, chatbot
+     )
+     submit_btn.click(add_text, [chatbot, txt], [chatbot, txt]).then(
+         bot, chatbot, chatbot
+     )
+     clear_btn.click(lambda: None, None, chatbot, queue=False)
+
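+ # Enable request queuing and launch the Gradio server when run as a script.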
+ if __name__ == "__main__":
+     demo.queue()
+     # demo.launch(share=True)
+     demo.launch(max_threads=40)
user.jpeg ADDED