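# File-view header captured together with the source (not part of the program):
#   Author: Roger Condori
#   Last commit: "change base model to falcon api" (64987dd, unverified)
#   Page links: raw | history | blame
#   Size: 10.4 kB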
import gradio as gr
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
from conversadocs.llamacppmodels import LlamaCpp #from langchain.llms import LlamaCpp
from huggingface_hub import hf_hub_download
import param
import os
import torch
from langchain.document_loaders import (
EverNoteLoader,
TextLoader,
UnstructuredEPubLoader,
UnstructuredHTMLLoader,
UnstructuredMarkdownLoader,
UnstructuredODTLoader,
UnstructuredPowerPointLoader,
UnstructuredWordDocumentLoader,
PyPDFLoader,
)
import gc
# Runs once at import time: trigger a garbage-collection pass and ask PyTorch
# to release its cached CUDA memory before any model is loaded.
gc.collect()
torch.cuda.empty_cache()
#YOUR_HF_TOKEN = os.getenv("My_hf_token")
# Maps a lower-case file suffix to ``(loader class, loader keyword arguments)``.
# Built from a flat table so each supported format is a single row.
EXTENSIONS = {
    suffix: (loader_cls, loader_kwargs)
    for suffix, loader_cls, loader_kwargs in (
        (".txt", TextLoader, {"encoding": "utf8"}),
        (".pdf", PyPDFLoader, {}),
        (".doc", UnstructuredWordDocumentLoader, {}),
        (".docx", UnstructuredWordDocumentLoader, {}),
        (".enex", EverNoteLoader, {}),
        (".epub", UnstructuredEPubLoader, {}),
        (".html", UnstructuredHTMLLoader, {}),
        (".md", UnstructuredMarkdownLoader, {}),
        (".odt", UnstructuredODTLoader, {}),
        (".ppt", UnstructuredPowerPointLoader, {}),
        (".pptx", UnstructuredPowerPointLoader, {}),
    )
}
#alter
def load_db(files):
    """Build an in-memory vector store from a list of document files.

    Every path whose suffix (compared case-insensitively) is a key of
    ``EXTENSIONS`` is read with the matching loader; other paths are skipped.
    If no document could be loaded, ``demo_docs/demo.txt`` is indexed instead.

    Args:
        files: Iterable of file path strings. ``None`` or an empty iterable
            falls back to the demo document.

    Returns:
        A ``DocArrayInMemorySearch`` store holding the embedded chunks.
    """
    documents = []
    for file in files or ():
        # EXTENSIONS keys are lower-case, so normalise the suffix; otherwise
        # "REPORT.PDF" would be skipped and the demo text indexed in its place.
        ext = "." + file.rsplit(".", 1)[-1].lower()
        loader_spec = EXTENSIONS.get(ext)
        if loader_spec is None:
            continue  # unsupported format
        loader_class, loader_args = loader_spec
        documents.extend(loader_class(file, **loader_args).load())

    # Nothing usable was supplied: fall back to the bundled demo document.
    if not documents:
        loader_class, loader_args = EXTENSIONS['.txt']
        documents = loader_class('demo_docs/demo.txt', **loader_args).load()

    # Split into overlapping chunks before embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

    # Create the vector database from the embedded chunks.
    return DocArrayInMemorySearch.from_documents(docs, embeddings)
def q_a(db, chain_type="stuff", k=3, llm=None):
    """Create a conversational retrieval chain on top of a vector store.

    Args:
        db: Vector store exposing ``as_retriever``.
        chain_type: Chain type forwarded to ``ConversationalRetrievalChain.from_llm``.
        k: Value passed to the retriever as ``search_kwargs={"k": k}``.
        llm: Language model used by the chain.

    Returns:
        The chain, configured to also return the source documents and the
        generated question. Chat memory is managed by the caller.
    """
    search_settings = {"k": k}
    doc_retriever = db.as_retriever(
        search_type="similarity",
        search_kwargs=search_settings,
    )
    chain_options = dict(
        llm=llm,
        chain_type=chain_type,
        retriever=doc_retriever,
        return_source_documents=True,
        return_generated_question=True,
    )
    return ConversationalRetrievalChain.from_llm(**chain_options)
class DocChat(param.Parameterized):
    """Chat session over a set of documents.

    Holds the vector store (``db``), the active language model (``llm``) and
    the conversational retrieval chain (``qa``) built from them by ``q_a``,
    plus the running chat history and the details of the last query.
    """

    # Local GGML model used as fallback when a requested model or a query fails.
    _DEFAULT_LLAMA_REPO = "TheBloke/Llama-2-7B-Chat-GGML"
    _DEFAULT_LLAMA_FILE = "llama-2-7b-chat.ggmlv3.q2_K.bin"

    # (question, answer) tuples passed back to the chain as conversation context.
    chat_history = param.List([])
    # Answer text of the most recent query.
    answer = param.String("")
    # "generated_question" returned by the chain for the most recent query.
    db_query = param.String("")
    # "source_documents" returned by the chain for the most recent query.
    db_response = param.List([])
    # Number of documents the retriever is asked for.
    k_value = param.Integer(3)
    # Active language model; set by the model-loading methods below.
    llm = None

    def __init__(self, **params):
        """Index the demo document and start with the Falcon API model.

        The Hugging Face token is read from the ``My_hf_token`` environment
        variable.
        """
        super().__init__(**params)
        self.loaded_file = ["demo_docs/demo.txt"]
        self.db = load_db(self.loaded_file)
        self.default_falcon_model(os.getenv("My_hf_token"))
        self.qa = q_a(self.db, "stuff", self.k_value, self.llm)

    def call_load_db(self, path_file, k):
        """Rebuild the vector store and the chain from new files.

        Args:
            path_file: List of file paths; only the first one is checked for
                existence before loading.
            k: Number of documents the retriever should return.

        Returns:
            A status message. On failure the previous database, chain and
            ``k_value`` are left untouched.
        """
        # Covers both "nothing selected" (None / empty list) and a missing file.
        if not path_file or not os.path.exists(path_file[0]):
            return "No file loaded"
        try:
            new_db = load_db(path_file)
            new_qa = q_a(new_db, "stuff", k, self.llm)
            # Assigned first: param validates the value and may reject it,
            # in which case nothing else has been modified yet.
            self.k_value = k
        except Exception:
            return 'No valid file'
        self.db = new_db
        self.loaded_file = path_file
        self.qa = new_qa
        # NOTE(review): the message says "history cleared" but the history is
        # not cleared here (the call was already disabled) -- confirm intent.
        return f"New DB created and history cleared | Loaded File: {self.loaded_file}"

    def _reload_default_llama(self):
        """Switch to the default local Llama-2 7B model (fallback path)."""
        return self.change_llm(self._DEFAULT_LLAMA_REPO, self._DEFAULT_LLAMA_FILE)

    # chat
    def convchain(self, query, k_max, recall_previous_messages):
        """Answer ``query`` against the loaded documents.

        Args:
            query: The user's question.
            k_max: Number of documents to retrieve; the chain is rebuilt when
                it differs from the current ``k_value``.
            recall_previous_messages: When falsy, the chat history is cleared
                before the question is asked.

        Returns:
            The answer text. The exchange is appended to ``chat_history`` and
            ``db_query`` / ``db_response`` / ``answer`` are updated.
        """
        if k_max != self.k_value:
            print("Maximum querys changed")
            self.qa = q_a(self.db, "stuff", k_max, self.llm)
            self.k_value = k_max
        if not recall_previous_messages:
            self.clr_history()

        inputs = {"question": query, "chat_history": self.chat_history}
        try:
            result = self.qa(inputs)
        except Exception:
            # The current model failed: fall back to the default local model
            # and retry once. A second failure propagates to the caller.
            print("Error not get response from model, reloaded default llama-2 7B config")
            self._reload_default_llama()
            self.qa = q_a(self.db, "stuff", k_max, self.llm)
            # change_llm reset k_value to its own default; keep it in sync
            # with the chain that was just built.
            self.k_value = k_max
            result = self.qa(inputs)

        self.chat_history.append((query, result["answer"]))
        self.db_query = result["generated_question"]
        self.db_response = result["source_documents"]
        self.answer = result['answer']
        return self.answer

    def summarize(self, chunk_size=2000, chunk_overlap=100):
        """Summarize the currently loaded files with a map-reduce chain.

        Args:
            chunk_size: Chunk size given to the text splitter.
            chunk_overlap: Chunk overlap given to the text splitter.

        Returns:
            The summary text, or a message when summarization is disabled
            (environment variable ``DEMO`` equals ``SET_LIMIT``) or no
            document could be loaded.
        """
        if "SET_LIMIT" == os.getenv("DEMO"):
            return "Since the space only uses the CPU, the summarization function cannot be used."

        # load docs
        documents = []
        for file in self.loaded_file:
            # EXTENSIONS keys are lower-case, so compare case-insensitively.
            ext = "." + file.rsplit(".", 1)[-1].lower()
            if ext in EXTENSIONS:
                loader_class, loader_args = EXTENSIONS[ext]
                loader = loader_class(file, **loader_args)
                documents.extend(loader.load_and_split())
        if not documents:
            return "Error in summarization"

        # split documents
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Raw string: "\." is an invalid escape in a normal literal.
            separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
        )
        docs = text_splitter.split_documents(documents)

        # summarize
        from langchain.chains.summarize import load_summarize_chain
        chain = load_summarize_chain(self.llm, chain_type='map_reduce', verbose=True)
        return chain.run(docs)

    def change_llm(self, repo_, file_, max_tokens=256, temperature=0.2, top_p=0.95, top_k=50, repeat_penalty=1.2, k=3):
        """Download a llama.cpp model file and make it the active LLM.

        Args:
            repo_: Hugging Face Hub repository id.
            file_: Model file name inside the repository.
            max_tokens, temperature, top_p, top_k, repeat_penalty: Generation
                settings forwarded to ``LlamaCpp``.
            k: Number of documents the rebuilt chain should retrieve.

        Returns:
            A status message. If loading fails, the default Llama-2 7B model
            is loaded instead; if that default itself cannot be loaded, a
            message is returned rather than retrying forever.
        """
        use_gpu = torch.cuda.is_available()
        try:
            model_path = hf_hub_download(repo_id=repo_, filename=file_)

            # Drop the old chain and model and free memory before loading.
            self.qa = None
            self.llm = None
            gc.collect()
            torch.cuda.empty_cache()

            if use_gpu:
                hardware_kwargs = {
                    "n_ctx": 4096,
                    "n_batch": 512,
                    # fewer offloaded layers for 70B repositories
                    "n_gpu_layers": 25 if '70B' in repo_.upper() else 35,
                }
                mode = "GPU INFERENCE"
            else:
                hardware_kwargs = {"n_ctx": 2048, "n_batch": 8}
                mode = "CPU INFERENCE SLOW"

            self.llm = LlamaCpp(
                model_path=model_path,
                max_tokens=max_tokens,
                verbose=False,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                repeat_penalty=repeat_penalty,
                **hardware_kwargs,
            )
            self.qa = q_a(self.db, "stuff", k, self.llm)
            self.k_value = k
            return f"Loaded {file_} [{mode}]"
        except Exception:
            # Stop here when the fallback model itself failed; recursing
            # again would repeat the same failure without end.
            if (repo_, file_) == (self._DEFAULT_LLAMA_REPO, self._DEFAULT_LLAMA_FILE):
                return "No valid model | Default llama-2 7B config could not be loaded"
            self._reload_default_llama()
            return "No valid model | Reloaded default llama-2 7B config"

    def default_falcon_model(self, HF_TOKEN):
        """Use ``tiiuae/falcon-7b-instruct`` through ``HuggingFaceHub``.

        Args:
            HF_TOKEN: Hugging Face API token.

        Returns:
            A status message.
        """
        self.llm = HuggingFaceHub(
            huggingfacehub_api_token=HF_TOKEN,
            repo_id="tiiuae/falcon-7b-instruct",
            model_kwargs={
                "temperature": 0.2,
                "max_new_tokens": 500,
                "top_k": 50,
                "top_p": 0.95,
                "repetition_penalty": 1.2,
            },
        )
        self.qa = q_a(self.db, "stuff", self.k_value, self.llm)
        return "Loaded model Falcon 7B-instruct [API FAST INFERENCE]"

    def openai_model(self, API_KEY):
        """Use OpenAI ``gpt-3.5-turbo`` as the active LLM.

        Args:
            API_KEY: OpenAI API key.

        Returns:
            A status message.
        """
        self.llm = ChatOpenAI(temperature=0, openai_api_key=API_KEY, model_name='gpt-3.5-turbo')
        self.qa = q_a(self.db, "stuff", self.k_value, self.llm)
        return "Loaded model OpenAI gpt-3.5-turbo [API FAST INFERENCE]"

    @param.depends('db_query')
    def get_lquest(self):
        """Return the chain's generated question from the last query.

        When no query has been made yet, a notice is printed and ``None`` is
        returned.
        """
        if not self.db_query:
            # NOTE(review): the notice is printed, not returned -- confirm
            # whether callers expect the text as the return value.
            print("Last question to DB: no DB accesses so far")
            return None
        return self.db_query

    @param.depends('db_response')
    def get_sources(self):
        """Return the elements of the last source documents as one flat list.

        Returns ``None`` when there are no source documents.
        """
        if not self.db_response:
            return None
        return [element for doc in self.db_response for element in doc]

    @param.depends('convchain', 'clr_history')
    def get_chats(self):
        """Return a copy of the chat history, or a notice when it is empty."""
        if not self.chat_history:
            return "No History Yet"
        return list(self.chat_history)

    def clr_history(self, count=0):
        """Reset the chat history; ``count`` is accepted but not used."""
        self.chat_history = []
        return "HISTORY CLEARED"