# Load libraries and dependencies
from pypdf import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


# Transform a list of PDFs into a single string
def get_pdf_text(pdf_documents):
    # Initialize empty string to accumulate the extracted text
    text = ""
    # Append text extracted from each page of each document
    for pdf in pdf_documents:
        pdf_reader = PdfReader(pdf, strict=True)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


# Transform a single string of text into a list of text chunks
def get_text_chunks(raw_text, separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len):
    # Initialize TextSplitter with default parameters
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=length_function
    )
    # Create list of text chunks
    return text_splitter.split_text(raw_text)


# Initialize embeddings
def init_embeddings(type=1):
    # Choose the embedding model depending on the project's requirements
    if type == 1:
        # OpenAI embeddings (hosted, requires an OpenAI API key)
        return OpenAIEmbeddings()
    else:
        # Instructor embeddings (runs locally via Hugging Face)
        return HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")


# Initialize conversation chain
def get_conversation_chain(chunks, embeddings):
    # Create a FAISS vector store from the text chunks and expose it as a retriever
    knowledge_base = FAISS.from_texts(chunks, embeddings).as_retriever()
    # Create buffer to store the conversation history
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    # Initialize language model
    language_model = ChatOpenAI()
    # Create conversation chain
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=language_model,
        retriever=knowledge_base,
        memory=memory
    )
    return conversation_chain
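

# --- Usage sketch (illustrative only) ---
# A minimal example of wiring the helpers above together, under the assumption
# that OPENAI_API_KEY is set in the environment (needed by OpenAIEmbeddings and
# ChatOpenAI). The PDF file names and the question are hypothetical placeholders.
if __name__ == "__main__":
    raw_text = get_pdf_text(["report.pdf", "appendix.pdf"])  # hypothetical input files
    chunks = get_text_chunks(raw_text)
    embeddings = init_embeddings()  # defaults to OpenAI embeddings
    chain = get_conversation_chain(chunks, embeddings)
    # The chain keeps chat history in memory, so follow-up questions can refer
    # back to earlier answers in the same session.
    response = chain({"question": "What are the main topics covered in these documents?"})
    print(response["answer"])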