whoami02 committed
Commit 35fa3f4 • 1 Parent(s): c59f483

Rename bot.py to app.py

Files changed (1)
  1. bot.py → app.py +5 -68
bot.py → app.py RENAMED
@@ -1,13 +1,8 @@
- import torch
  import os
  import gradio as gr
- from auto_gptq import AutoGPTQForCausalLM
- # from ctransformers import AutoModelForCausalLM, AutoConfig, Config
- from transformers import AutoTokenizer, pipeline, GenerationConfig
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
  from langchain_community.vectorstores import Chroma
  from langchain.retrievers import MultiQueryRetriever
- # from langchain.retrievers.document_compressors import LLMChainExtractor
  from langchain.chains import ConversationalRetrievalChain
  from langchain.memory import ConversationBufferWindowMemory
  from langchain_community.llms import llamacpp, huggingface_pipeline
@@ -16,9 +11,7 @@ from langchain.chains import LLMChain
  from langchain.chains.question_answering import load_qa_chain
  from huggingface_hub import hf_hub_download
  from dotenv import load_dotenv
- # import os
- # os.getenv('hf_token')
- # MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
+
  _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a
  standalone question without changing the content in given question.
  Chat History:
@@ -31,42 +24,9 @@ Do not use any other information for answering the user. Provide a detailed answ

  load_dotenv()

- def load_quantized_model_gptq(model_id, model_basename):
- # if ".safetensors" in model_basename:
- # model_basename = model_basename.replace(".safetensors", "")
- tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, cache_dir = r"E:\AW\LLMs\models")
- model = AutoGPTQForCausalLM.from_quantized(
- model_id,
- # model_basename=model_basename,
- use_safetensors=True,
- trust_remote_code=True,
- device_map="auto",
- use_triton=False,
- cache_dir = r"E:\AW\LLMs\models"
- )
- generation_config = GenerationConfig.from_pretrained(model_id)
- pipe = pipeline(
- "text-generation",
- model=model, #type: ignore
- tokenizer=tokenizer,
- max_length=20000,
- temperature=0.7,
- # top_p=0.95,
- repetition_penalty=1.15,
- generation_config=generation_config,
- )
- local_llm = huggingface_pipeline.HuggingFacePipeline(pipeline=pipe)
- return local_llm
-
  def load_quantized_model(model_id=None):
  MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
- # if model_id == "Zephyr-7b-Beta":
- # MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
- # elif model_id == "Llama-2-7b-chat":
- # MODEL_ID, MODEL_BASENAME = "TheBloke/Llama-2-7b-Chat-GGUF","llama-2-7b-chat.Q4_K_M.gguf"
-
  try:
- # logging.info("Using LlamaCPP for GGUF quantized model")
  model_path = hf_hub_download(
  repo_id=MODEL_ID,
  filename=MODEL_BASENAME,
@@ -80,7 +40,6 @@ def load_quantized_model(model_id=None):
  'n_batch': 512,
  # 'n_gpu_layers':6,
  }
- # offloading 5 layers to gpu gave ans in 6-7 mins; 3270 mb of VRAM
  return llamacpp.LlamaCpp(**kwargs)
  except TypeError:
  print("Supported model architecture: Llama, Mistral")
@@ -97,33 +56,21 @@ with gr.Blocks() as demo:
  """)

  with gr.Row():
- with gr.Column(scale=2): #type:ignore
- # with gr.Column(scale=5):
- # with gr.Row():
- # file_output = gr.File(label="Uploaded Documents",show_label=True)
- # with gr.Row():
- # upload_button = gr.UploadButton("Click to upload files", file_types=[".pdf", ".csv", ".xlsx", ".txt"], file_count="multiple")
- # upload_button.upload(upload_files, upload_button, file_output)
+ with gr.Column(scale=1):
  with gr.Row():
  model_id = gr.Radio(["Zephyr-7b-Beta", "Llama-2-7b-chat"], value="Llama-2-7b-chat",label="LLM Model")
- # Temp = gr.Slider(minimum=0, maximum=5, step=0.1, info="Adjust the [random parameter] of LLM from here")
  with gr.Row():
  mode = gr.Radio(['Document', 'Data'], value='Document',label="QA mode")
- # print(f"selected {model} model with {Temp} temperature")
  persist_directory = "db"
  embeddings = HuggingFaceBgeEmbeddings(
  model_name = "BAAI/bge-small-en-v1.5",
  model_kwargs={"device": "cpu"},
  encode_kwargs = {'normalize_embeddings':True},
- cache_folder=r"E:\AW\LLMs\models",
+ cache_folder="models",
  )
  db2 = Chroma(persist_directory = persist_directory,embedding_function = embeddings)
  # llm = load_quantized_model(model_id=model_id) #type:ignore
- MODEL_ID = "TheBloke/Llama-2-7B-Chat-GPTQ"
- # MODEL_I = "HuggingFaceH4/zephyr-7b-beta"
- MODEL_BASENAME = "gptq-4bit-32g-actorder_True"
  # ---------------------------------------------------------------------------------------------------
- # llm = load_quantized_model_gptq(model_id=MODEL_ID, model_basename=MODEL_BASENAME)
  llm = load_quantized_model()
  # ---------------------------------------------------------------------------------------------------
  condense_question_prompt_template = PromptTemplate.from_template(_template)
@@ -133,20 +80,11 @@ with gr.Blocks() as demo:
  Helpful Answer:"""
  qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
  memory = ConversationBufferWindowMemory(memory_key='chat_history', k=1, return_messages=True)
-
- # memory = ConversationKGMemory(llm=llm, memory_key='chat_history', return_messages=True)
- # compressor = LLMChainExtractor.from_llm(llm=llm)
- # compression_retriever = ContextualCompressionRetriever(
- # base_compressor=compressor,
- # base_retriever=db2.as_retriever(search_kwargs={'k':5})
- # )
  retriever_from_llm = MultiQueryRetriever.from_llm(
  retriever=db2.as_retriever(search_kwargs={'k':5}),
  llm = llm,
- # llm = load_quantized_model(model_id="TheBloke/Llama-2-7B-Chat-GPTQ")
  )
  qa2 = ConversationalRetrievalChain(
- # retriever=db.as_retriever(),
  retriever=retriever_from_llm,
  question_generator= LLMChain(llm=llm, prompt=condense_question_prompt_template, memory=memory, verbose=True), #type:ignore
  combine_docs_chain=load_qa_chain(llm=llm, chain_type="stuff", prompt=qa_prompt, verbose=True), #type:ignore
@@ -168,7 +106,7 @@ with gr.Blocks() as demo:
  history[-1][1] = res['answer']
  torch.cuda.empty_cache()
  return history
- with gr.Column(scale=8): # type: ignore
+ with gr.Column(scale=9): # type: ignore
  with gr.Row():
  chatbot = gr.Chatbot([], elem_id="chatbot",label="Chat", height=500, show_label=True, avatar_images=["user.jpeg","Bot.jpg"])
  with gr.Row():
@@ -198,5 +136,4 @@ with gr.Blocks() as demo:

  if __name__ == "__main__":
  demo.queue()
- # demo.launch(share=True)
- demo.launch(max_threads=40)
+ demo.launch(max_threads=40, debug=True)
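
After this commit, app.py keeps only the llama.cpp/GGUF loading path: the auto_gptq and transformers imports, the load_quantized_model_gptq() helper, and the hard-coded local cache paths are removed, and the UI columns are rebalanced to scale=1 and scale=9. Note that the diff also drops import torch while the retained chat callback still calls torch.cuda.empty_cache(), which will fail at runtime unless torch is imported elsewhere. For reference, here is a minimal sketch of the retained loader, assuming llama-cpp-python, langchain-community, and huggingface_hub are installed; only the repo/filename pair and 'n_batch': 512 are visible in the diff, so the other LlamaCpp parameters below are illustrative assumptions rather than the elided values from the actual file.

from huggingface_hub import hf_hub_download
from langchain_community.llms import llamacpp

def load_quantized_model(model_id=None):
    # Repo and filename as they appear in the diff.
    MODEL_ID = "TheBloke/zephyr-7B-beta-GGUF"
    MODEL_BASENAME = "zephyr-7b-beta.Q5_K_S.gguf"
    try:
        # Download the GGUF weights, or reuse the local Hub cache on later runs.
        model_path = hf_hub_download(repo_id=MODEL_ID, filename=MODEL_BASENAME)
        kwargs = {
            "model_path": model_path,
            "n_batch": 512,       # value visible in the diff
            "n_ctx": 4096,        # assumption: context window
            "max_tokens": 512,    # assumption: max generation length
            "temperature": 0.7,   # assumption: sampling temperature
            # "n_gpu_layers": 6,  # optional GPU offload, commented out upstream
        }
        return llamacpp.LlamaCpp(**kwargs)
    except TypeError:
        # Mirrors the upstream guard for unsupported model architectures.
        print("Supported model architecture: Llama, Mistral")

Because the Gradio block calls load_quantized_model() once at startup, the weight download stays out of the request path, and hf_hub_download serves the file from the local Hugging Face cache on subsequent launches.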
 
 