import os
import pickle
import faiss
import langchain
from langchain import HuggingFaceHub
from langchain.cache import InMemoryCache
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader, PyPDFDirectoryLoader
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from mapping import FILE_URL_MAPPING
from memory import CustomMongoDBChatMessageHistory

langchain.llm_cache = InMemoryCache()  # cache repeated LLM calls in memory

models = ["GPT-3.5", "Flan UL2", "GPT-4", "Flan T5"]
pickle_file = "_vs.pkl"
index_file = "_vs.index"
models_folder = "models/"
MONGO_DB_URL = os.environ['MONGO_DB_URL']

# Module-level defaults; set_model/set_embeddings and set_session_id replace these per request
llm = ChatOpenAI(model_name="gpt-4", temperature=0.1)
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
message_history = CustomMongoDBChatMessageHistory(
    connection_string=MONGO_DB_URL, session_id='session_id', database_name='coursera_bots',
    collection_name='3d_printing_applications'
)
memory = ConversationBufferWindowMemory(memory_key="chat_history", k=4)
vectorstore_index = None
system_template = """You are Coursera QA Bot. Have a conversation with a human, answering the following questions as best you can.
You are a teaching assistant for a Coursera Course: 3D Printing Applications and can answer any question about that using vectorstore or context.
Use the following pieces of context to answer the users question.
----------------
{context}"""
messages = [
SystemMessagePromptTemplate.from_template(system_template),
HumanMessagePromptTemplate.from_template("{question}"),
]
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)


def set_session_id(session_id):
global message_history, memory
# check if message_history with same session id exists
if message_history.session_id == session_id:
print("Session id already set: " + str(message_history.session_id))
else:
# create new message history with session id
print("Setting session id to " + str(session_id))
        message_history = CustomMongoDBChatMessageHistory(
            connection_string=MONGO_DB_URL, session_id=session_id, database_name='coursera_bots',
            collection_name='3d_printing_applications'  # match the default collection name above
        )
memory = ConversationBufferWindowMemory(memory_key="chat_history", chat_memory=message_history, k=10,
return_messages=True)


def set_model_and_embeddings(model):
    set_model(model)
    # set_embeddings(model)


def set_model(model):
global llm
print("Setting model to " + str(model))
if model == "GPT-3.5":
print("Loading GPT-3.5")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)
elif model == "GPT-4":
print("Loading GPT-4")
llm = ChatOpenAI(model_name="gpt-4", temperature=0.1)
elif model == "Flan UL2":
print("Loading Flan-UL2")
llm = HuggingFaceHub(repo_id="google/flan-ul2", model_kwargs={"temperature": 0.1, "max_new_tokens": 500})
elif model == "Flan T5":
print("Loading Flan T5")
llm = HuggingFaceHub(repo_id="google/flan-t5-base", model_kwargs={"temperature": 0.1})
    else:
        print("Loading GPT-3.5 as fallback")
        llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)


def set_embeddings(model):
global embeddings
if model == "GPT-3.5" or model == "GPT-4":
print("Loading OpenAI embeddings")
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
elif model == "Flan UL2" or model == "Flan T5":
print("Loading Hugging Face embeddings")
embeddings = HuggingFaceHubEmbeddings(repo_id="sentence-transformers/all-MiniLM-L6-v2")


def get_search_index(model):
global vectorstore_index
    pickle_path = get_file_path(model, pickle_file)
    index_path = get_file_path(model, index_file)
    if os.path.isfile(pickle_path) and os.path.isfile(index_path) and os.path.getsize(pickle_path) > 0:
        # Load the pickled FAISS vectorstore; the raw faiss index file is written alongside it in create_index
        with open(pickle_path, "rb") as f:
            search_index = pickle.load(f)
        print("Loaded index")
else:
search_index = create_index(model)
print("Created index")
vectorstore_index = search_index
return search_index


def create_index(model):
source_chunks = create_chunk_documents()
search_index = search_index_from_docs(source_chunks)
faiss.write_index(search_index.index, get_file_path(model, index_file))
# Save index to pickle file
with open(get_file_path(model, pickle_file), "wb") as f:
pickle.dump(search_index, f)
return search_index


def get_file_path(model, file):
    # OpenAI-embedded indexes are stored under "openai", Hugging Face ones under "hf"
    if model == "GPT-3.5" or model == "GPT-4":
        return models_folder + "openai" + file
    else:
        return models_folder + "hf" + file


def search_index_from_docs(source_chunks):
    # Build a FAISS vectorstore over the chunks using the currently active embeddings
    search_index = FAISS.from_documents(source_chunks, embeddings)
    return search_index


def get_pdf_files():
loader = PyPDFDirectoryLoader('docs', glob="**/*.pdf", recursive=True)
document_list = loader.load()
return document_list


def get_html_files():
loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
document_list = loader.load()
return document_list


def fetch_data_for_embeddings():
document_list = get_text_files()
document_list.extend(get_html_files())
document_list.extend(get_pdf_files())
    # Attach the course URL for each file (from FILE_URL_MAPPING) so answers can cite sources
    for document in document_list:
        document.metadata["url"] = FILE_URL_MAPPING.get(document.metadata["source"])
print("document list: " + str(len(document_list)))
return document_list


def get_text_files():
loader = DirectoryLoader('docs', glob="**/*.txt", loader_cls=TextLoader, recursive=True)
document_list = loader.load()
return document_list


def create_chunk_documents():
sources = fetch_data_for_embeddings()
splitter = CharacterTextSplitter(separator=" ", chunk_size=800, chunk_overlap=0)
source_chunks = splitter.split_documents(sources)
print("chunks: " + str(len(source_chunks)))
return source_chunks


def get_qa_chain(vectorstore_index):
global llm
print(llm)
# embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
# compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=gpt_3_5_index.as_retriever())
retriever = vectorstore_index.as_retriever(search_type="similarity_score_threshold",
search_kwargs={"score_threshold": .76})
chain = ConversationalRetrievalChain.from_llm(llm, retriever, return_source_documents=True,
verbose=True,
combine_docs_chain_kwargs={"prompt": CHAT_PROMPT})
return chain


def get_chat_history(inputs) -> str:
res = []
for human, ai in inputs:
res.append(f"Human:{human}\nAI:{ai}")
return "\n".join(res)


def generate_answer(question) -> str:
global vectorstore_index
chain = get_qa_chain(vectorstore_index)
# get last 4 messages from chat history
history = memory.chat_memory.messages[-4:]
result = chain(
{"question": question, "chat_history": history})
save_chat_history(question, result)
    sources = []
    print(result)
    for document in result['source_documents']:
        url = document.metadata.get('url')
        if url:  # skip documents with no entry in FILE_URL_MAPPING
            sources.append("\n" + url)
    print(sources)
    source = ',\n'.join(set(sources))
    return result['answer'] + '\nSOURCES: ' + source


def save_chat_history(question, result):
memory.chat_memory.add_user_message(question)
memory.chat_memory.add_ai_message(result["answer"])
print("chat history after saving: " + str(memory.chat_memory.messages))