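# PrivateGPT: a Gradio chat UI that answers questions over a local Chroma vector
# store using a locally loaded quantized LLM (GGUF via llama.cpp, with an optional
# GPTQ path), wired together with LangChain's ConversationalRetrievalChain.
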
import torch
import os
import gradio as gr
from auto_gptq import AutoGPTQForCausalLM
# from ctransformers import AutoModelForCausalLM, AutoConfig, Config
from transformers import AutoTokenizer, pipeline, GenerationConfig
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.retrievers import MultiQueryRetriever
# from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.memory import ConversationBufferWindowMemory
from langchain_community.llms import llamacpp, huggingface_pipeline
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from huggingface_hub import hf_hub_download
from dotenv import load_dotenv

# os.getenv('hf_token')
# MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"
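
# Prompt used to rewrite a follow-up question (given the chat history) into a
# self-contained standalone question before retrieval.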
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a
standalone question, without changing the meaning of the original question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

system_prompt = """You are a helpful assistant; you will use the provided context to answer user questions.
Read the given context before answering questions and think step by step. If you cannot answer a user question based on the provided context, inform the user.
Do not use any other information to answer the user. Provide a detailed answer to the question."""
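
# Load environment variables (e.g. a Hugging Face token) from a local .env file.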
load_dotenv()
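

# Loads a GPTQ-quantized model with AutoGPTQ and wraps it in a transformers
# text-generation pipeline exposed as a LangChain LLM. The app below currently
# uses the GGUF/llama.cpp loader instead; this is kept as an alternative path.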
def load_quantized_model_gptq(model_id, model_basename):
    # if ".safetensors" in model_basename:
    #     model_basename = model_basename.replace(".safetensors", "")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, cache_dir=r"E:\AW\LLMs\models")
    model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        # model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        cache_dir=r"E:\AW\LLMs\models",
    )
    generation_config = GenerationConfig.from_pretrained(model_id)
    pipe = pipeline(
        "text-generation",
        model=model,  # type: ignore
        tokenizer=tokenizer,
        max_length=20000,
        temperature=0.7,
        # top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )
    local_llm = huggingface_pipeline.HuggingFacePipeline(pipeline=pipe)
    return local_llm
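

# Downloads a GGUF checkpoint from the Hugging Face Hub and loads it through
# LangChain's LlamaCpp wrapper (CPU by default; GPU layer offload is commented
# out in the kwargs below).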
def load_quantized_model(model_id=None):
    MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"
    # if model_id == "Zephyr-7b-Beta":
    #     MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"
    # elif model_id == "Llama-2-7b-chat":
    #     MODEL_ID, MODEL_BASENAME = "TheBloke/Llama-2-7b-Chat-GGUF", "llama-2-7b-chat.Q4_K_M.gguf"
    try:
        # logging.info("Using LlamaCPP for GGUF quantized model")
        model_path = hf_hub_download(
            repo_id=MODEL_ID,
            filename=MODEL_BASENAME,
            resume_download=True,
            cache_dir=r"E:\AW\LLMs\models",
        )
        kwargs = {
            'model_path': model_path,
            'n_ctx': 10000,
            'max_tokens': 10000,
            'n_batch': 512,
            # 'n_gpu_layers': 6,
        }
        # Offloading 5 layers to the GPU gave an answer in 6-7 minutes and used ~3270 MB of VRAM.
        return llamacpp.LlamaCpp(**kwargs)
    except TypeError:
        print("Supported model architectures: Llama, Mistral")
        return None
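

# Helper for the (currently commented-out) upload button: returns the local
# paths of the uploaded files.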
def upload_files(files):
    file_paths = [file.name for file in files]
    return file_paths

with gr.Blocks() as demo:
    gr.Markdown(
        """
        <h2> <center> PrivateGPT </center> </h2>
        """)
    with gr.Row():
        with gr.Column(scale=2):  # type: ignore
            # with gr.Column(scale=5):
            #     with gr.Row():
            #         file_output = gr.File(label="Uploaded Documents", show_label=True)
            #     with gr.Row():
            #         upload_button = gr.UploadButton("Click to upload files", file_types=[".pdf", ".csv", ".xlsx", ".txt"], file_count="multiple")
            #         upload_button.upload(upload_files, upload_button, file_output)
            with gr.Row():
                model_id = gr.Radio(["Zephyr-7b-Beta", "Llama-2-7b-chat"], value="Llama-2-7b-chat", label="LLM Model")
                # Temp = gr.Slider(minimum=0, maximum=5, step=0.1, info="Adjust the [random parameter] of LLM from here")
            with gr.Row():
                mode = gr.Radio(['Document', 'Data'], value='Document', label="QA mode")
            # print(f"selected {model} model with {Temp} temperature")
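            # Vector store: a Chroma DB persisted under "db", queried with
            # BAAI/bge-small-en-v1.5 embeddings computed on the CPU.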
            persist_directory = "db"
            embeddings = HuggingFaceBgeEmbeddings(
                model_name="BAAI/bge-small-en-v1.5",
                model_kwargs={"device": "cpu"},
                encode_kwargs={'normalize_embeddings': True},
                cache_folder=r"E:\AW\LLMs\models",
            )
            db2 = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
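            # The persisted "db" directory is assumed to have been built ahead of time.
            # A minimal sketch of how such an index could be created (hypothetical
            # ingest step with an example file name; not part of this app):
            #
            #   from langchain_community.document_loaders import PyPDFLoader
            #   from langchain.text_splitter import RecursiveCharacterTextSplitter
            #   docs = PyPDFLoader("example.pdf").load()
            #   chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
            #   Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)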
            # llm = load_quantized_model(model_id=model_id)  # type: ignore
            MODEL_ID = "TheBloke/Llama-2-7B-Chat-GPTQ"
            # MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
            MODEL_BASENAME = "gptq-4bit-32g-actorder_True"
            # ---------------------------------------------------------------------
            # llm = load_quantized_model_gptq(model_id=MODEL_ID, model_basename=MODEL_BASENAME)
            llm = load_quantized_model()
            # ---------------------------------------------------------------------
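            # Prompt templates plus a one-turn sliding-window memory for the
            # conversational retrieval chain.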
            condense_question_prompt_template = PromptTemplate.from_template(_template)
            prompt_template = system_prompt + """
{context}
Question: {question}
Helpful Answer:"""
            qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
            memory = ConversationBufferWindowMemory(memory_key='chat_history', k=1, return_messages=True)
            # memory = ConversationKGMemory(llm=llm, memory_key='chat_history', return_messages=True)
            # compressor = LLMChainExtractor.from_llm(llm=llm)
            # compression_retriever = ContextualCompressionRetriever(
            #     base_compressor=compressor,
            #     base_retriever=db2.as_retriever(search_kwargs={'k': 5}),
            # )
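            # MultiQueryRetriever: asks the LLM for several rephrasings of the user
            # question and merges the top-5 Chroma results for each.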
            retriever_from_llm = MultiQueryRetriever.from_llm(
                retriever=db2.as_retriever(search_kwargs={'k': 5}),
                llm=llm,
                # llm=load_quantized_model(model_id="TheBloke/Llama-2-7B-Chat-GPTQ"),
            )
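            # Conversational RAG chain: condense the follow-up question, retrieve
            # context, then answer with a "stuff" QA chain over the retrieved documents.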
            qa2 = ConversationalRetrievalChain(
                # retriever=db.as_retriever(),
                retriever=retriever_from_llm,
                question_generator=LLMChain(llm=llm, prompt=condense_question_prompt_template, memory=memory, verbose=True),  # type: ignore
                combine_docs_chain=load_qa_chain(llm=llm, chain_type="stuff", prompt=qa_prompt, verbose=True),  # type: ignore
                memory=memory,
                verbose=True,
            )
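
            # Chat callbacks: add_text appends the user message to the history;
            # bot runs the chain on the latest question and fills in the answer.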
            def add_text(history, text):
                history = history + [(text, None)]
                return history, ""

            def bot(history):
                res = qa2.invoke(
                    {
                        'question': history[-1][0],
                        'chat_history': history[:-1],
                    }
                )
                history[-1][1] = res['answer']
                torch.cuda.empty_cache()
                return history
        with gr.Column(scale=8):  # type: ignore
            with gr.Row():
                chatbot = gr.Chatbot([], elem_id="chatbot", label="Chat", height=500, show_label=True, avatar_images=["user.jpeg", "Bot.jpg"])
            with gr.Row():
                with gr.Column(scale=8):  # type: ignore
                    txt = gr.Textbox(
                        show_label=False,
                        placeholder="Enter text and press enter",
                        container=False,
                    )
                with gr.Column(scale=1):  # type: ignore
                    submit_btn = gr.Button(
                        'Submit',
                        variant='primary',
                    )
                with gr.Column(scale=1):  # type: ignore
                    clear_btn = gr.Button(
                        'Clear',
                        variant="stop",
                    )
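
    # Wire up the events: pressing Enter or clicking Submit adds the message and
    # then runs the bot; Clear resets the chat history.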
    txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
        bot, chatbot, chatbot
    )
    submit_btn.click(add_text, [chatbot, txt], [chatbot, txt]).then(
        bot, chatbot, chatbot
    )
    clear_btn.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.queue()
    # demo.launch(share=True)
    demo.launch(max_threads=40)