import streamlit as st
from pypdf import PdfReader
from langchain import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory

def read_pdf(file):
    # Concatenate the extracted text of every page in the uploaded PDF
    document = ""
    reader = PdfReader(file)
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        document += page.extract_text() or ""
    return document

class llmClass():
    """Placeholder returned when no LLM is connected (e.g. no Hugging Face token)."""

    def __init__(self, token, llm_model, instruct_embeddings, existing_vector_store, temperature, max_length):
        print("Created")

    def generate_answer(self, question, token):
        return "LLM not connected"

def read_txt(file):
    # Decode the uploaded bytes; str() would keep the b'...' wrapper and escape the newlines
    document = file.getvalue().decode("utf-8")
    # Pad line breaks with spaces so the text splitter can break on them
    document = document.replace("\n", " \n ").replace("\r", " \r ")
    return document

def split_doc(document, chunk_size, chunk_overlap):
    # Split the raw text into overlapping chunks, then wrap each chunk in a Document
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    split = splitter.split_text(document)
    split = splitter.create_documents(split)
    return split

def embedding_storing(model_name, split, create_new_vs, existing_vector_store, new_vs_name):
    if create_new_vs is not None:
        # Load the embedding model (set device to "cpu" if no GPU is available)
        instructor_embeddings = HuggingFaceInstructEmbeddings(
            model_name=model_name, model_kwargs={"device": "cuda"}
        )

        # Embed the chunks into a FAISS index
        db = FAISS.from_documents(split, instructor_embeddings)

        if create_new_vs:
            # Save a brand-new vector store under the chosen name
            db.save_local("vector store/" + new_vs_name)
        else:
            # Merge the new index into an existing vector store
            load_db = FAISS.load_local(
                "vector store/" + existing_vector_store,
                instructor_embeddings,
                allow_dangerous_deserialization=True
            )
            load_db.merge_from(db)
            load_db.save_local("vector store/" + new_vs_name)

        st.success("The document has been saved.")

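# A minimal ingestion sketch tying the helpers above together (the uploaded_file
# object, model id, chunk sizes, and store names are illustrative assumptions):
#
#   document = read_pdf(uploaded_file)  # or read_txt(uploaded_file) for .txt uploads
#   split = split_doc(document, chunk_size=200, chunk_overlap=20)
#   embedding_storing(
#       model_name="hkunlp/instructor-xl",
#       split=split,
#       create_new_vs=True,
#       existing_vector_store="",
#       new_vs_name="my_documents",
#   )
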
def prepare_rag_llm(
    token, llm_model, instruct_embeddings, vector_store_list, temperature, max_length
):
    # Assumption: without a Hugging Face token the Hub cannot be reached, so fall
    # back to the placeholder object defined above
    if token == "":
        return llmClass(token, llm_model, instruct_embeddings, vector_store_list, temperature, max_length)

    # Load the embedding model; it must match the one used to build the vector store
    instructor_embeddings = HuggingFaceInstructEmbeddings(
        model_name=instruct_embeddings, model_kwargs={"device": "cuda"}
    )

    # Load the selected FAISS index from disk
    loaded_db = FAISS.load_local(
        "vector store/" + vector_store_list,
        instructor_embeddings,
        allow_dangerous_deserialization=True
    )

    # LLM hosted on the Hugging Face Hub
    llm = HuggingFaceHub(
        repo_id=llm_model,
        model_kwargs={"temperature": temperature, "max_length": max_length},
        huggingfacehub_api_token=token
    )

    # Remember the last few turns of the conversation (the window size k=2 is an assumption)
    memory = ConversationBufferWindowMemory(
        k=2,
        memory_key="chat_history",
        output_key="answer",
        return_messages=True
    )

    # Retrieval-augmented conversational chain, implied by the imports above;
    # generate_answer() below expects "answer" and "source_documents" in its response
    qa_conversation = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=loaded_db.as_retriever(),
        return_source_documents=True,
        memory=memory
    )

    return qa_conversation

def generate_answer(question, token):
    answer = "An error has occurred"

    if token == "":
        answer = "Insert the Hugging Face token"
        doc_source = ["no source"]
    else:
        # Query the conversational chain stored in the Streamlit session state
        response = st.session_state.conversation({"question": question})
        # Keep only the text after the prompt's "Helpful Answer:" marker
        answer = response.get("answer").split("Helpful Answer:")[-1].strip()
        # Return the retrieved chunks so the sources can be shown alongside the answer
        explanation = response.get("source_documents", [])
        doc_source = [d.page_content for d in explanation]

    return answer, doc_source
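# A minimal query sketch (the token and model ids are illustrative assumptions;
# st.session_state.conversation is the key generate_answer() reads):
#
#   st.session_state.conversation = prepare_rag_llm(
#       token="hf_...",
#       llm_model="mistralai/Mistral-7B-Instruct-v0.2",
#       instruct_embeddings="hkunlp/instructor-xl",
#       vector_store_list="my_documents",
#       temperature=0.7,
#       max_length=300,
#   )
#   answer, doc_source = generate_answer("What does the document say?", token="hf_...")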