import streamlit as st
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
import os
from PyPDF2 import PdfReader
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
#from htmlTemplates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub

###########
#pip install faiss-cpu
#pip install langchain
#pip install pypdf
#pip install tiktoken
#pip install InstructorEmbedding
#pip install streamlit
#pip install PyPDF2
###############


# Convert all PDFs in a folder into a single string
def get_pdf_text(folder_path):
    text = ""
    # Go through every file in the given directory
    for filename in os.listdir(folder_path):
        filepath = os.path.join(folder_path, filename)

        # Check whether the file has the ".pdf" extension
        if os.path.isfile(filepath) and filename.lower().endswith(".pdf"):
            pdf_reader = PdfReader(filepath)
            for page in pdf_reader.pages:
                # extract_text() can return None for pages without extractable text
                text += page.extract_text() or ""
            #text += '\n'

    return text

# Split the text into chunks
def get_text_chunks(text):
    # Define how the text splitter should work
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

# Only used to create the local "Store" directory and save the vector database there
def create_vectorstore_and_store(text_chunks):
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
    # Initialize the FAISS DB from the text chunks
    vectorstoreDB = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    ###
    ### --> TODO: afterwards the PDF directory should be deleted or the files moved,
    ###     because they would otherwise be indexed again on the next upload
    ###
    # Directory in which the vector DB is stored
    save_directory = "Store"
    # Save the vector DB locally
    vectorstoreDB.save_local(save_directory)
    print(vectorstoreDB)
    return None
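
# The TODO above says the PDF folder should be emptied (files deleted or moved) after
# indexing so the same documents are not re-indexed on the next upload. A minimal
# sketch of such a helper, assuming a hypothetical "processed" archive directory;
# this helper is not part of the original code and is not called anywhere yet.
def archive_processed_pdfs(folder_path, archive_path="processed"):
    os.makedirs(archive_path, exist_ok=True)
    for filename in os.listdir(folder_path):
        filepath = os.path.join(folder_path, filename)
        if os.path.isfile(filepath) and filename.lower().endswith(".pdf"):
            # Move each indexed PDF out of the upload folder into the archive
            # (os.replace assumes both folders are on the same filesystem)
            os.replace(filepath, os.path.join(archive_path, filename))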
    
########

def get_vectorstore():
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
    # Load the local vector database
    save_directory = "Store"
    vectorstoreDB = FAISS.load_local(save_directory, embeddings)
    return vectorstoreDB
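

# The commented-out call to get_conversation_chain() in main() has no definition in this
# file. Below is a minimal sketch of what it could look like, pieced together from the
# otherwise unused imports above (HuggingFaceHub, ConversationBufferMemory,
# ConversationalRetrievalChain). The model name "google/flan-t5-base" and its parameters
# are assumptions, not part of the original code; HuggingFaceHub also expects a
# HUGGINGFACEHUB_API_TOKEN environment variable.
def get_conversation_chain(vectorstore):
    # LLM served via the Hugging Face Hub inference API (assumed model)
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-base",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    # Chat history buffer so follow-up questions keep their context
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    # Retrieval-augmented conversational chain over the FAISS vector store
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
    return conversation_chain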


def main():
    user_question = st.text_area("Ask me a question: ")
    folder_path = './files'
    pdf_text = get_pdf_text(folder_path)
    text_chunks = get_text_chunks(pdf_text)
    create_vectorstore_and_store(text_chunks)       # when a new PDF comes in

    if user_question:
        retriever = get_vectorstore().as_retriever()
        retrieved_docs = retriever.invoke(user_question)
        if retrieved_docs:
            st.text(retrieved_docs[0].page_content)

    #vectorstore_DB=get_vectorstore()        # when queried through the chatbot
    #print(get_vectorstore().similarity_search_with_score("stelle")) # shows whether the vector database is populated
    
    #print(get_conversation_chain(get_vectorstore()))
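
    # A possible next step (a sketch, not part of the original code): instead of showing
    # the raw retrieved chunk, let the conversational chain sketched above generate an
    # answer. Kept commented out, like the line above, since the chain needs an LLM.
    #if user_question:
    #    chain = get_conversation_chain(get_vectorstore())
    #    response = chain({"question": user_question})
    #    st.write(response["answer"])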




if __name__ == '__main__':
    main()