# Utility functions for a PDF question-answering chat app (LangChain + FAISS).
# Load libraries and dependencies
from pypdf import PdfReader

from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
# Transform a list of PDFs into a single string
def get_pdf_text(pdf_documents):
    """Extract and concatenate the text of every page in *pdf_documents*.

    Args:
        pdf_documents: Iterable of PDF sources accepted by ``pypdf.PdfReader``
            (file paths or binary file-like objects).

    Returns:
        str: All extracted page text joined into one string. An empty
        iterable yields ``""``.

    Raises:
        pypdf.errors.PdfReadError: ``strict=True`` makes the reader raise
            on malformed PDF syntax rather than merely logging a warning.
    """
    pages = []
    for pdf in pdf_documents:
        pdf_reader = PdfReader(pdf, strict=True)
        # extract_text() returns "" for pages without a text layer, so the
        # join below is safe even for image-only pages.
        pages.extend(page.extract_text() for page in pdf_reader.pages)
    # str.join avoids the quadratic cost of repeated `text += ...`.
    return "".join(pages)
# Transform a single line of text into an array of text chunks
def get_text_chunks(raw_text, separator="\n", chunk_size=1000, chunk_overlap=200, lenght_function=len):
    """Split *raw_text* into overlapping chunks for embedding.

    Args:
        raw_text (str): The full text to split.
        separator (str): Boundary the splitter prefers to cut on.
        chunk_size (int): Target maximum size of each chunk, measured by
            ``lenght_function``.
        chunk_overlap (int): Number of units shared between adjacent chunks
            so context is not lost at chunk boundaries.
        lenght_function (callable): Measures chunk size (defaults to ``len``,
            i.e. characters). NOTE(review): the misspelling ("lenght") is
            part of the public keyword interface and is kept for backward
            compatibility with existing callers.

    Returns:
        list[str]: The chunked text.
    """
    # Initialize TextSplitter with the caller-supplied settings
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=lenght_function,
    )
    # Create list of text chunks
    return text_splitter.split_text(raw_text)
# Initialize embeddings
def init_embeddings(type=1):
    """Return an embeddings backend for the vector store.

    Args:
        type (int): ``1`` selects OpenAI embeddings (remote, paid API);
            any other value selects local HuggingFace Instructor embeddings.
            NOTE(review): the parameter shadows the ``type`` builtin, but
            renaming it would break keyword callers, so it is kept.

    Returns:
        An embeddings object usable by ``FAISS.from_texts``.
    """
    if type == 1:
        # OpenAI Embeddings (requires OPENAI_API_KEY in the environment)
        return OpenAIEmbeddings()
    # Instructor Embeddings, computed locally
    return HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
# Initialize Conversation Chain
def get_conversation_chain(chunks, embeddings):
    """Build a conversational retrieval chain over the given text chunks.

    Args:
        chunks (list[str]): Text chunks to index (e.g. from
            ``get_text_chunks``).
        embeddings: Embeddings backend (e.g. from ``init_embeddings``) used
            to vectorize the chunks.

    Returns:
        ConversationalRetrievalChain: Chain that answers questions against
        the indexed chunks while keeping chat history in an in-memory
        conversation buffer.
    """
    # Create Vector Database from text chunks and embeddings
    knowledge_base = FAISS.from_texts(chunks, embeddings).as_retriever()
    # Create buffer to store the conversation memory; return_messages=True
    # yields message objects (required by chat models) instead of a string.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    # Initialize language model (default OpenAI chat model)
    language_model = ChatOpenAI()
    # Create conversation chain
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=language_model,
        retriever=knowledge_base,
        memory=memory,
    )
    return conversation_chain