|
import streamlit as st |
|
import os |
|
from langchain.chains import RetrievalQA, ConversationalRetrievalChain |
|
from langchain.chat_models import ChatOpenAI |
|
from langchain.document_loaders import TextLoader |
|
from langchain.text_splitter import CharacterTextSplitter |
|
from langchain.embeddings import OpenAIEmbeddings |
|
from langchain.vectorstores import FAISS |
|
from langchain_community.vectorstores import Qdrant |
|
|
|
from langchain.memory import ConversationBufferMemory |
|
from langchain.document_loaders import TextLoader |
|
from tempfile import NamedTemporaryFile |
|
|
|
import re |
|
def main():
    """Streamlit app: upload a .txt document and ask questions about it.

    Builds a retrieval chain over the uploaded document once per file
    (cached in ``st.session_state``) so that Streamlit reruns neither
    re-embed the document nor wipe the conversational memory.
    """
    st.title('Dokument-basiertes Q&A System')

    uploaded_file = st.file_uploader("Dokument hochladen", type=['txt'])
    if uploaded_file is None:
        return

    # Rebuild the chain only when a different file is uploaded. Without
    # this cache, every Streamlit rerun (each widget interaction) would
    # re-embed the whole document and recreate the memory object,
    # discarding the chat history.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get('doc_key') != file_key:
        st.session_state['doc_key'] = file_key
        st.session_state['chain'] = _build_chain(uploaded_file)
    conversation_chain = st.session_state['chain']

    query = st.text_input("Frag deinen Dokumenten!")
    if query:
        result = conversation_chain({"question": query})
        st.write("Antwort:", result["answer"])
        st.write("Quellen:")
        for doc in result["source_documents"]:
            # Show only the first line of each source chunk as a citation.
            st.write(doc.page_content.split('\n', 1)[0])


def _build_chain(uploaded_file):
    """Embed *uploaded_file* and return a ConversationalRetrievalChain over it.

    The upload is spooled to a temporary file because TextLoader needs a
    filesystem path; the file is closed before loading (required on
    Windows, where an open temp file cannot be reopened) and always
    removed afterwards.
    """
    with NamedTemporaryFile(delete=False, suffix=".txt") as f:
        f.write(uploaded_file.getbuffer())
        tmp_path = f.name
    try:
        data = TextLoader(tmp_path, encoding="utf-8").load()
    finally:
        # Bug fix: the original used delete=False and never unlinked,
        # leaking one temp file per upload.
        os.unlink(tmp_path)

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(data)

    # In-memory Qdrant instance: nothing is persisted between sessions.
    vectorstore = Qdrant.from_documents(
        chunks,
        OpenAIEmbeddings(),
        location=":memory:",
        collection_name="my_documents",
    )

    llm = ChatOpenAI(temperature=0.3, model_name="gpt-4-turbo")
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True, output_key='answer')
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Entry point when the script is executed directly (e.g. `streamlit run app.py`).
if __name__ == "__main__":

    main()
|
|
|
|
|
|
|
|