File size: 4,791 Bytes
ff1f92b 5585965 900a2bf b884de1 cf0475c 565faf3 6126668 8c1aead 565faf3 5585965 ff1f92b f5dd29d 5585965 ff1f92b 5585965 6debecb 8809b14 a52aa9f 8809b14 5585965 ff1f92b 5585965 7446d35 a9e1591 cc21256 5585965 ff1f92b 5585965 a9e1591 cc21256 5585965 ff1f92b 6126668 5585965 f1e2b8d 87177f6 f1e2b8d 71068ec 565faf3 171a569 71068ec 7de699a c94507e 7de699a 71068ec 565faf3 18e53c5 989cff4 18e53c5 7de699a 18e53c5 0fc73f9 7de699a 989cff4 0bb8b3e 989cff4 7de699a 4eaef39 266e137 b041f09 a0bdf98 0bb8b3e 5585965 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import streamlit as st
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
import os
from PyPDF2 import PdfReader
from transformers import pipeline
from transformers import AutoModel
#Retriever erweiterung
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub
###########
#pip install faiss-cpu
#pip install langchain
#pip install pypdf
#pip install tiktoken
#pip install InstructorEmbedding
###############
# PDF in String umwandeln
def get_pdf_text(folder_path):
    """Extract the text of all PDFs in *folder_path* and translate it to English.

    The folder is scanned non-recursively; only regular files whose name ends
    in ".pdf" (case-insensitive) are read. Line breaks are replaced by spaces
    and "- " sequences (hyphenation left over from line wraps) are removed
    before the text is handed to the German-to-English translation pipeline.
    Both the raw and the translated text are echoed to the Streamlit page.

    Args:
        folder_path: Directory containing the PDF files.

    Returns:
        The translated text as a single string, or "" when no PDF text was
        found (in that case the translation model is not loaded at all).
    """
    page_texts = []
    # Sorted so the concatenation order does not depend on the file system.
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        if not (os.path.isfile(filepath) and filename.lower().endswith(".pdf")):
            continue
        for page in PdfReader(filepath).pages:
            # Pages without a text layer may yield None/"" - treat both as empty.
            page_texts.append(page.extract_text() or "")

    text = "".join(page_texts)
    text = text.replace("\n", " ")
    text = text.replace("- ", "")
    if not text.strip():
        # Nothing to translate; avoid loading the model for an empty corpus.
        return ""

    st.text(text)
    translator = pipeline("translation_de_to_en", model="t5-small")
    # NOTE(review): the whole corpus is translated in one call; the model's
    # input length is limited, so long documents may get truncated - confirm.
    translated = translator(text)
    # The pipeline returns a list of {"translation_text": ...} dicts, not a
    # string; callers (the text splitter) need plain text.
    text = " ".join(item["translation_text"] for item in translated)
    st.text(text)
    return text
#Chunks erstellen
def get_text_chunks(text):
    """Split *text* into overlapping chunks suitable for embedding.

    Args:
        text: The full document text as one string.

    Returns:
        A list of string chunks of roughly 1000 characters each, with an
        overlap of 200 characters between neighbouring chunks.
    """
    text_splitter = CharacterTextSplitter(
        # get_pdf_text() replaces every newline with a space, so a "\n"
        # separator would never match and the whole corpus would end up in
        # a single oversized chunk. Splitting on spaces keeps chunk_size
        # effective for that whitespace-normalised input.
        separator=" ",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return text_splitter.split_text(text)
# nur zum Anlegen des lokalen Verzeichnisses "Store" und speichern der Vektor-Datenbank
def create_vectorstore_and_store():
    """Build the FAISS index from the PDFs in ./files and save it to "Store".

    Only needed once to create the local "Store" directory; the app itself
    reads the persisted index via get_vectorstore().

    Returns:
        None. The index is written to disk and printed to stdout.
    """
    # PDF text -> chunks
    chunks = get_text_chunks(get_pdf_text('./files'))

    # Alternative model that was tried: "aari1995/German_Semantic_STS_V2"
    embedder = HuggingFaceInstructEmbeddings(
        model_name="deutsche-telekom/bert-multi-english-german-squad2"
    )

    # Embed the chunks into a fresh FAISS index and persist it locally.
    index = FAISS.from_texts(texts=chunks, embedding=embedder)
    index.save_local("Store")
    print(index)
########
def get_vectorstore():
    """Load the FAISS index that was saved to the local "Store" directory.

    Returns:
        The vector store returned by FAISS.load_local, bound to the same
        embedding model name that is used when the index is created.
    """
    # Alternative model that was tried: "aari1995/German_Semantic_STS_V2"
    # NOTE(review): load_local presumably unpickles data from "Store" - only
    # load directories this app created itself; confirm against the library.
    return FAISS.load_local(
        "Store",
        HuggingFaceInstructEmbeddings(
            model_name="deutsche-telekom/bert-multi-english-german-squad2"
        ),
    )
######
#####
def main():
    """Streamlit entry point: answer a user question from the stored PDFs.

    Reads a question from a text area, retrieves matching chunks from the
    persisted vector store, shows the combined context and runs an extractive
    question-answering pipeline on it. Nothing is retrieved or computed until
    the user has actually entered a question.
    """
    user_question = st.text_area("Stell mir eine Frage: ")
    if not user_question:
        # No input yet: skip loading the store and embedding an empty query.
        return

    st.text(user_question)

    retriever = get_vectorstore().as_retriever()
    retrieved_docs = retriever.invoke(user_question)

    # Use the three best hits. The previous indexing (0, 1, 3) skipped the
    # third-ranked document and raised IndexError with fewer than four hits.
    top_docs = retrieved_docs[:3]
    if not top_docs:
        st.text("Es wurden keine passenden Textausschnitte gefunden.")
        return

    # Join with a space so words at document boundaries are not fused.
    context = " ".join(doc.page_content for doc in top_docs)
    context = context.replace("\n", " ")
    context = context.replace("- ", "")
    st.text("Das ist der Textausschnitt der durch den Retriever herausgesucht wird:")
    st.text(context)

    # Build the question-answering pipeline (German/English SQuAD2 model).
    qa_pipeline = pipeline(
        "question-answering",
        model="deutsche-telekom/bert-multi-english-german-squad2",
        tokenizer="deutsche-telekom/bert-multi-english-german-squad2",
    )
    # Variant that returns several candidates:
    # answer = qa_pipeline(question=user_question, context=context, top_k=3)
    answer = qa_pipeline(question=user_question, context=context)

    # Show the extracted span and the raw pipeline result.
    st.text("Basisantwort:")
    st.text(answer["answer"])
    st.text(answer)

    # TODO: rephrase the extracted base answer into a full sentence, e.g.:
    # text2text_generator = pipeline("text2text-generation", model="google/flan-t5-xxl")
    # new_text = text2text_generator("Formuliere einen neuen Satz. Frage: " + user_question + " Antwort: " + answer["answer"])
    # st.text(new_text)
# Run the Streamlit app only when executed as a script, not on import.
if __name__ == '__main__':
    main()