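# PrivateGPT: a Gradio chat UI that answers questions over a local Chroma vector
# store using a locally loaded quantized LLM (GGUF via llama.cpp, with an optional
# GPTQ path), wired together with LangChain's ConversationalRetrievalChain.
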
import torch
import os
import gradio as gr
from auto_gptq import AutoGPTQForCausalLM
# from ctransformers import AutoModelForCausalLM, AutoConfig, Config
from transformers import AutoTokenizer, pipeline, GenerationConfig
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.retrievers import MultiQueryRetriever
# from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.memory import ConversationBufferWindowMemory
from langchain_community.llms import llamacpp, huggingface_pipeline
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from huggingface_hub import hf_hub_download
from dotenv import load_dotenv

# os.getenv('hf_token')
# MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"
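
# Prompt used to rewrite a follow-up question (given the chat history) into a
# self-contained standalone question before retrieval.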
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a
standalone question, without changing the meaning of the original question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

system_prompt = """You are a helpful assistant; you will use the provided context to answer user questions.
Read the given context before answering questions and think step by step. If you cannot answer a user question based on the provided context, inform the user.
Do not use any other information to answer the user. Provide a detailed answer to the question."""
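
# Load environment variables (e.g. a Hugging Face token) from a local .env file.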
load_dotenv()
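

# Loads a GPTQ-quantized model with AutoGPTQ and wraps it in a transformers
# text-generation pipeline exposed as a LangChain LLM. The app below currently
# uses the GGUF/llama.cpp loader instead; this is kept as an alternative path.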
def load_quantized_model_gptq(model_id, model_basename):
    # if ".safetensors" in model_basename:
    #     model_basename = model_basename.replace(".safetensors", "")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, cache_dir=r"E:\AW\LLMs\models")
    model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        # model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        cache_dir=r"E:\AW\LLMs\models",
    )
    generation_config = GenerationConfig.from_pretrained(model_id)
    pipe = pipeline(
        "text-generation",
        model=model,  # type: ignore
        tokenizer=tokenizer,
        max_length=20000,
        temperature=0.7,
        # top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )
    local_llm = huggingface_pipeline.HuggingFacePipeline(pipeline=pipe)
    return local_llm
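

# Downloads a GGUF checkpoint from the Hugging Face Hub and loads it through
# LangChain's LlamaCpp wrapper (CPU by default; GPU layer offload is commented
# out in the kwargs below).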
def load_quantized_model(model_id=None):
    MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"
    # if model_id == "Zephyr-7b-Beta":
    #     MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q5_K_S.gguf"
    # elif model_id == "Llama-2-7b-chat":
    #     MODEL_ID, MODEL_BASENAME = "TheBloke/Llama-2-7b-Chat-GGUF", "llama-2-7b-chat.Q4_K_M.gguf"
    try:
        # logging.info("Using LlamaCPP for GGUF quantized model")
        model_path = hf_hub_download(
            repo_id=MODEL_ID,
            filename=MODEL_BASENAME,
            resume_download=True,
            cache_dir=r"E:\AW\LLMs\models",
        )
        kwargs = {
            'model_path': model_path,
            'n_ctx': 10000,
            'max_tokens': 10000,
            'n_batch': 512,
            # 'n_gpu_layers': 6,
        }
        # Offloading 5 layers to the GPU gave an answer in 6-7 minutes and used ~3270 MB of VRAM.
        return llamacpp.LlamaCpp(**kwargs)
    except TypeError:
        print("Supported model architectures: Llama, Mistral")
        return None
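

# Helper for the (currently commented-out) upload button: returns the local
# paths of the uploaded files.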
def upload_files(files):
    file_paths = [file.name for file in files]
    return file_paths

with gr.Blocks() as demo:
    gr.Markdown(
        """
        <h2> <center> PrivateGPT </center> </h2>
        """)
    with gr.Row():
        with gr.Column(scale=2):  # type: ignore
            # with gr.Column(scale=5):
            #     with gr.Row():
            #         file_output = gr.File(label="Uploaded Documents", show_label=True)
            #     with gr.Row():
            #         upload_button = gr.UploadButton("Click to upload files", file_types=[".pdf", ".csv", ".xlsx", ".txt"], file_count="multiple")
            #         upload_button.upload(upload_files, upload_button, file_output)
            with gr.Row():
                model_id = gr.Radio(["Zephyr-7b-Beta", "Llama-2-7b-chat"], value="Llama-2-7b-chat", label="LLM Model")
                # Temp = gr.Slider(minimum=0, maximum=5, step=0.1, info="Adjust the [random parameter] of LLM from here")
            with gr.Row():
                mode = gr.Radio(['Document', 'Data'], value='Document', label="QA mode")
            # print(f"selected {model} model with {Temp} temperature")
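            # Vector store: a Chroma DB persisted under "db", queried with
            # BAAI/bge-small-en-v1.5 embeddings computed on the CPU.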
            persist_directory = "db"
            embeddings = HuggingFaceBgeEmbeddings(
                model_name="BAAI/bge-small-en-v1.5",
                model_kwargs={"device": "cpu"},
                encode_kwargs={'normalize_embeddings': True},
                cache_folder=r"E:\AW\LLMs\models",
            )
            db2 = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
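            # The persisted "db" directory is assumed to have been built ahead of time.
            # A minimal sketch of how such an index could be created (hypothetical
            # ingest step with an example file name; not part of this app):
            #
            #   from langchain_community.document_loaders import PyPDFLoader
            #   from langchain.text_splitter import RecursiveCharacterTextSplitter
            #   docs = PyPDFLoader("example.pdf").load()
            #   chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
            #   Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)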
            # llm = load_quantized_model(model_id=model_id)  # type: ignore
            MODEL_ID = "TheBloke/Llama-2-7B-Chat-GPTQ"
            # MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
            MODEL_BASENAME = "gptq-4bit-32g-actorder_True"
            # ---------------------------------------------------------------------
            # llm = load_quantized_model_gptq(model_id=MODEL_ID, model_basename=MODEL_BASENAME)
            llm = load_quantized_model()
            # ---------------------------------------------------------------------
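            # Prompt templates plus a one-turn sliding-window memory for the
            # conversational retrieval chain.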
            condense_question_prompt_template = PromptTemplate.from_template(_template)
            prompt_template = system_prompt + """
{context}
Question: {question}
Helpful Answer:"""
            qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
            memory = ConversationBufferWindowMemory(memory_key='chat_history', k=1, return_messages=True)
            # memory = ConversationKGMemory(llm=llm, memory_key='chat_history', return_messages=True)
            # compressor = LLMChainExtractor.from_llm(llm=llm)
            # compression_retriever = ContextualCompressionRetriever(
            #     base_compressor=compressor,
            #     base_retriever=db2.as_retriever(search_kwargs={'k': 5}),
            # )
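            # MultiQueryRetriever: asks the LLM for several rephrasings of the user
            # question and merges the top-5 Chroma results for each.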
            retriever_from_llm = MultiQueryRetriever.from_llm(
                retriever=db2.as_retriever(search_kwargs={'k': 5}),
                llm=llm,
                # llm=load_quantized_model(model_id="TheBloke/Llama-2-7B-Chat-GPTQ"),
            )
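            # Conversational RAG chain: condense the follow-up question, retrieve
            # context, then answer with a "stuff" QA chain over the retrieved documents.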
            qa2 = ConversationalRetrievalChain(
                # retriever=db.as_retriever(),
                retriever=retriever_from_llm,
                question_generator=LLMChain(llm=llm, prompt=condense_question_prompt_template, memory=memory, verbose=True),  # type: ignore
                combine_docs_chain=load_qa_chain(llm=llm, chain_type="stuff", prompt=qa_prompt, verbose=True),  # type: ignore
                memory=memory,
                verbose=True,
            )
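
            # Chat callbacks: add_text appends the user message to the history;
            # bot runs the chain on the latest question and fills in the answer.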
            def add_text(history, text):
                history = history + [(text, None)]
                return history, ""

            def bot(history):
                res = qa2.invoke(
                    {
                        'question': history[-1][0],
                        'chat_history': history[:-1],
                    }
                )
                history[-1][1] = res['answer']
                torch.cuda.empty_cache()
                return history
        with gr.Column(scale=8):  # type: ignore
            with gr.Row():
                chatbot = gr.Chatbot([], elem_id="chatbot", label="Chat", height=500, show_label=True, avatar_images=["user.jpeg", "Bot.jpg"])
            with gr.Row():
                with gr.Column(scale=8):  # type: ignore
                    txt = gr.Textbox(
                        show_label=False,
                        placeholder="Enter text and press enter",
                        container=False,
                    )
                with gr.Column(scale=1):  # type: ignore
                    submit_btn = gr.Button(
                        'Submit',
                        variant='primary',
                    )
                with gr.Column(scale=1):  # type: ignore
                    clear_btn = gr.Button(
                        'Clear',
                        variant="stop",
                    )
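
    # Wire up the events: pressing Enter or clicking Submit adds the message and
    # then runs the bot; Clear resets the chat history.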
    txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
        bot, chatbot, chatbot
    )
    submit_btn.click(add_text, [chatbot, txt], [chatbot, txt]).then(
        bot, chatbot, chatbot
    )
    clear_btn.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.queue()
    # demo.launch(share=True)
    demo.launch(max_threads=40)