File size: 3,806 Bytes
ff1f92b
5585965
 
 
 
 
 
f9919ad
 
 
cf0475c
5585965
 
 
 
 
 
 
ff1f92b
f5dd29d
5585965
 
 
 
 
 
ff1f92b
5585965
 
 
 
 
 
ff1f92b
5585965
ff1f92b
5585965
 
 
 
 
 
 
 
 
 
 
 
 
7446d35
 
 
 
cc21256
 
5585965
 
 
 
 
 
 
 
 
ff1f92b
5585965
cc21256
 
5585965
 
 
 
ff1f92b
 
a353273
f1e2b8d
fd18087
87177f6
fd18087
f1e2b8d
 
 
 
fd18087
 
 
 
 
25ba8c5
fd18087
 
 
 
4714d74
fd18087
 
a353273
8b94f40
 
 
 
 
 
62b5d38
ff1f92b
8b94f40
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import streamlit as st
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
import os
from PyPDF2 import PdfReader
from transformers import pipeline
from transformers import AutoModel


###########
#pip install faiss-cpu
#pip install langchain
#pip install pypdf
#pip tiktoken
#pip install InstructorEmbedding
###############


# PDF in String umwandeln
def get_pdf_text(folder_path):
    text = ""
    # Durchsuche alle Dateien im angegebenen Verzeichnis
    for filename in os.listdir(folder_path):
        filepath = os.path.join(folder_path, filename)

        # Überprüfe, ob die Datei die Erweiterung ".pdf" hat
        if os.path.isfile(filepath) and filename.lower().endswith(".pdf"):
            pdf_reader = PdfReader(filepath)
            for page in pdf_reader.pages:
                text += page.extract_text()
            #text += '\n'

    return text

#Chunks erstellen
def get_text_chunks(text):
    #Arbeitsweise Textsplitter definieren
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

# nur zum Anlegen des lokalen Verzeichnisses "Store" und speichern der Vektor-Datenbank
def create_vectorstore_and_store():
    folder_path = './files'
    pdf_text = get_pdf_text(folder_path)
    text_chunks = get_text_chunks(pdf_text)
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
    #embeddings = HuggingFaceInstructEmbeddings(model_name="aari1995/German_Semantic_STS_V2")
    # Initiate Faiss DB
    vectorstoreDB = FAISS.from_texts(texts=text_chunks,embedding=embeddings)#texts=text_chunks,
    # Verzeichnis in dem die VektorDB gespeichert werden soll
    save_directory = "Store"
    #VektorDB lokal speichern
    vectorstoreDB.save_local(save_directory)
    return None
    
########

def get_vectorstore():
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
    #embeddings = HuggingFaceInstructEmbeddings(model_name="aari1995/German_Semantic_STS_V2")
    #Abruf lokaler Vektordatenbank
    save_directory = "Store"
    vectorstoreDB = FAISS.load_local(save_directory, embeddings)
    return vectorstoreDB


def get_llm_answer(user_question):
    #if os.path.exists("./Store"): #Nutzereingabe nur eingelesen, wenn vectorstore angelegt
    #user_question = st.text_area("Stell mir eine Frage: ")
            #if os.path.exists("./Store"): #Nutzereingabe nur eingelesen, wenn vectorstore angelegt
    # Retriever sucht passende Textausschnitte in den PDFs (unformatiert)
    retriever=get_vectorstore().as_retriever()
    retrieved_docs=retriever.invoke(
    user_question
    )
    # Top 3 Suchergebnisse des Retrievers als Context speichern
    context=""+retrieved_docs[0].page_content+retrieved_docs[1].page_content+retrieved_docs[2].page_content
    # Context bereinigen
    context=context.replace("\n", " ")  
    context=context.replace("- ", "")

    # Erstelle die Question Answering-Pipeline für Deutsch
    qa_pipeline = pipeline("question-answering", model="deutsche-telekom/bert-multi-english-german-squad2", tokenizer="deutsche-telekom/bert-multi-english-german-squad2")

    # Frage beantworten mit Q&A Pipeline
    answer = qa_pipeline(question=user_question, context=context, max_answer_length=200)
    
    return answer["answer"]
   
def main():
    st.set_page_config(
    page_title="Chatbot",
    layout="wide",
    initial_sidebar_state="expanded",
    )
    st.text("Chatbot Rene ist über Telegram erreichbar!")

if __name__ == '__main__':
    main()