File size: 3,087 Bytes
ff1f92b 5585965 cf0475c 5585965 ff1f92b f5dd29d 5585965 ff1f92b 5585965 ff1f92b 5585965 ff1f92b 5585965 7446d35 7f03e5a 5585965 ff1f92b 5585965 7f03e5a 5585965 ff1f92b 5585965 11c1175 7446d35 5585965 ff1f92b 5585965 ff1f92b 5585965 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import streamlit as st
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
import os
from PyPDF2 import PdfReader
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
#from htmlTemplates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub
###########
#pip install faiss-cpu
#pip install langchain
#pip install pypdf
#pip install tiktoken
#pip install InstructorEmbedding
###############
# PDF in String umwandeln
def get_pdf_text(folder_path):
    """Concatenate the extracted text of every PDF directly inside a folder.

    Only regular files whose name ends in ".pdf" (case-insensitive) are
    read; sub-directories are not searched. Files are processed in sorted
    filename order and page texts are joined without any separator.

    Args:
        folder_path: Directory to scan for PDF files.

    Returns:
        The combined text of all pages, or "" when the folder contains no
        PDF files.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    page_texts = []
    # Sort the listing: os.listdir returns entries in arbitrary order, which
    # would make the combined text (and every chunk derived from it) differ
    # between runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        if not (os.path.isfile(filepath) and filename.lower().endswith(".pdf")):
            continue
        pdf_reader = PdfReader(filepath)
        # `or ""` keeps a page that yields no text (should the extractor
        # return None) from aborting the whole run with a TypeError.
        page_texts.extend(page.extract_text() or "" for page in pdf_reader.pages)
    # Join once instead of growing a string with += inside the loops.
    return "".join(page_texts)
#Chunks erstellen
def get_text_chunks(text):
    """Split ``text`` into chunks with a newline-based character splitter.

    The splitter is configured with "\\n" as separator, a chunk size of
    1000 and a chunk overlap of 200, both measured with ``len``.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    # Splitter settings collected in one place, then expanded into the call.
    splitter_settings = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    splitter = CharacterTextSplitter(**splitter_settings)
    return splitter.split_text(text)
# nur zum Anlegen des lokalen Verzeichnisses "Store" und speichern der Vektor-Datenbank
def create_vectorstore_and_store(folder_path="./files", save_directory="Store"):
    """Build a FAISS index from the PDFs in a folder and save it locally.

    The PDF text is read with ``get_pdf_text``, chunked with
    ``get_text_chunks``, embedded with the "hkunlp/instructor-base"
    model and written to ``save_directory`` via ``save_local``.

    Args:
        folder_path: Directory containing the PDF files to index.
            Defaults to "./files".
        save_directory: Directory the vector store is saved to.
            Defaults to "Store".

    Returns:
        None. The result is the saved vector store on disk.

    Raises:
        ValueError: If no text chunks could be produced from
            ``folder_path`` (no PDFs, or PDFs without extractable text).
    """
    pdf_text = get_pdf_text(folder_path)
    text_chunks = get_text_chunks(pdf_text)
    # Fail early with a clear message: building the index from an empty
    # chunk list would otherwise fail deep inside the vector store code,
    # and only after the embedding model had already been loaded.
    if not text_chunks:
        raise ValueError(
            f"No text could be extracted from the PDF files in {folder_path!r}; "
            "nothing to index."
        )
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
    # Create the FAISS index from the chunks.
    vectorstore_db = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    # Persist the vector store so get_vectorstore() can load it later.
    vectorstore_db.save_local(save_directory)
    # Console trace that the store was built.
    print(vectorstore_db)
    return None
########
def get_vectorstore():
    """Load the vector store saved in the local "Store" directory.

    Returns:
        The object returned by ``FAISS.load_local`` for "Store", loaded
        with "hkunlp/instructor-base" instruct embeddings.
    """
    # NOTE(review): FAISS.load_local presumably unpickles stored metadata;
    # only point it at directories this app created itself - confirm
    # against the installed LangChain version.
    instructor = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
    return FAISS.load_local("Store", instructor)
def main():
    """Streamlit entry point: ask for a question and show the best match.

    Renders a text area. Once a non-blank question is entered, the vector
    store is rebuilt from the PDF folder, reloaded from disk and queried;
    the content of the first retrieved document is displayed. If nothing
    is retrieved, an info message is shown instead.
    """
    user_question = st.text_area("Stell mir eine Frage2: ")

    # Streamlit re-runs this script on every interaction. Without this
    # guard, each rerun with an empty box would re-embed all PDFs and run
    # a retrieval for an empty query whose result is never displayed.
    if not user_question or not user_question.strip():
        return

    # Rebuild the index before querying so newly added PDFs are included.
    create_vectorstore_and_store()
    retriever = get_vectorstore().as_retriever()
    retrieved_docs = retriever.invoke(user_question)

    # Indexing [0] on an empty result would raise IndexError.
    if not retrieved_docs:
        st.info("Keine passenden Textstellen gefunden.")
        return

    st.text(retrieved_docs[0].page_content)
    # Debug aid: get_vectorstore().similarity_search_with_score("...") shows
    # whether the vector store has been populated.
# Run the app only when executed as a script, not when imported.
if __name__ == '__main__':
    main()