ErikH committed on
Commit
5585965
1 Parent(s): eafabe9

Update pages/bot.py

Browse files
Files changed (1) hide show
  1. pages/bot.py +89 -25
pages/bot.py CHANGED
@@ -1,36 +1,100 @@
1
  import streamlit as st
2
- from transformers import pipeline
3
- import datetime
4
- import pandas as pd
5
- from pathlib import Path
6
- # to-do import upload
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
 
 
 
 
 
 
8
 
9
- st.markdown("# Chatbot")
10
- st.sidebar.markdown("# Chatbot")
 
 
 
 
11
 
12
- uploaded_file = '' # PLACEHOLDER
13
 
14
- # Display file content
15
- file_content = uploaded_file.read()
16
- st.write("Dateiinhalt:")
17
- st.code(file_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- # User input for question
20
- user_question = st.text_input("Stellen Sie eine Frage zum hochgeladenen PDF:")
 
 
 
 
21
 
22
 
23
- # Perform Hugging Face task (e.g., question answering)
24
- if user_question:
25
- question_answering = pipeline(
26
- "question-answering",
27
- model="deepset/gelectra-base-germanquad-distilled",
28
- tokenizer="deepset/gelectra-base-germanquad-distilled"
 
 
 
 
 
29
  )
 
 
 
 
 
 
 
 
 
 
30
 
31
- # Get answer to the user's question
32
- answer = question_answering(question=user_question, context=file_content)
33
 
34
- # Display the answer to the user's question
35
- st.write(f"Antwort auf die Frage '{user_question}': {answer['answer']}")
36
- st.write("Confidence Score:", answer['score'])
 
1
  import streamlit as st
2
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
3
+ from langchain.vectorstores import FAISS
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.document_loaders import DirectoryLoader, PyPDFLoader
6
+ import os
7
+ from PyPDF2 import PdfReader
8
+ from langchain.chains import RetrievalQAWithSourcesChain
9
+ from langchain.memory import ConversationBufferMemory
10
+ from langchain.chains import ConversationalRetrievalChain
11
+ #from htmlTemplates import css, bot_template, user_template
12
+ from langchain.llms import HuggingFaceHub
13
+ from dotenv import load_dotenv
14
+ ###########
15
+ #pip install faiss-cpu
16
+ #pip install langchain
17
+ #pip install pypdf
18
+ #pip install tiktoken
19
+ #pip install InstructorEmbedding
20
+ ###############
21
 
22
+ # PDF in String umwandeln
23
def get_pdf_text(folder_path):
    """Concatenate the extracted text of every PDF directly inside a folder.

    Only regular files whose name ends in ".pdf" (case-insensitive) are
    read; subdirectories are not searched. Files are processed in sorted
    name order so the result does not depend on the arbitrary order in
    which ``os.listdir`` returns entries.

    Args:
        folder_path: Directory to scan for PDF files.

    Returns:
        The text of all pages of all PDFs joined into one string (no
        separator is inserted between pages), or "" when the folder
        contains no PDF files.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist (raised by
            ``os.listdir``).
    """
    page_texts = []
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)

        # Skip anything that is not a regular file with a ".pdf" extension.
        if not (os.path.isfile(filepath) and filename.lower().endswith(".pdf")):
            continue

        pdf_reader = PdfReader(filepath)
        for page in pdf_reader.pages:
            # Fall back to "" so a page that yields no text (None) cannot
            # abort the whole extraction with a TypeError.
            page_texts.append(page.extract_text() or "")

    # Join once instead of growing a string with += inside the loops.
    return "".join(page_texts)
37
 
38
+ #Chunks erstellen
39
def get_text_chunks(text, chunk_size=1000, chunk_overlap=200):
    """Split a text into overlapping chunks for embedding.

    The text is split on newline characters with ``CharacterTextSplitter``;
    chunk length is measured with ``len`` (characters).

    Args:
        text: The full text to split.
        chunk_size: Target maximum chunk length, passed to the splitter.
            Defaults to 1000.
        chunk_overlap: Overlap between consecutive chunks, passed to the
            splitter. Defaults to 200.

    Returns:
        A list of chunk strings; an empty list when ``text`` is empty.
    """
    # Nothing to split: avoid constructing a splitter for empty input.
    if not text:
        return []

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)
49
+
50
+ # nur zum Anlegen des lokalen Verzeichnisses "Store" und speichern der Vektor-Datenbank
51
def create_vectorstore_and_store(text_chunks, save_directory="Store"):
    """Embed text chunks into a FAISS index and save it to a local directory.

    This function only builds and persists the vector database; it does not
    return it. Use the matching loader to read it back.

    Args:
        text_chunks: List of text chunks to embed.
        save_directory: Directory the vector database is written to.
            Defaults to "Store".

    Returns:
        None.

    Raises:
        ValueError: If ``text_chunks`` is empty. Checked before the embedding
            model is loaded, so the caller gets a clear message instead of a
            failure deep inside the index construction.
    """
    if not text_chunks:
        raise ValueError(
            "text_chunks is empty: cannot build a vector store without any text"
        )

    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")

    # Build the FAISS database from the chunks.
    vectorstore_db = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

    # TODO: afterwards the PDF directory should be cleared, or its files
    # moved elsewhere, before the next upload (the original note is
    # truncated; presumably to avoid re-indexing the same files - confirm).

    # Persist the vector database locally.
    vectorstore_db.save_local(save_directory)
    return None
64
+
65
+ ########
66
 
67
def get_vectorstore():
    """Load the FAISS vector database saved in the local "Store" directory.

    Returns:
        The FAISS vector store loaded with the "hkunlp/instructor-base"
        instructor embeddings.

    Raises:
        FileNotFoundError: If the "Store" directory does not exist. Checked
            before the embedding model is loaded so the failure is cheap
            and the message is explicit.
    """
    save_directory = "Store"
    if not os.path.isdir(save_directory):
        raise FileNotFoundError(
            f"Vector store directory '{save_directory}' does not exist; "
            "build the store before loading it"
        )

    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")

    # NOTE(review): the FAISS loader presumably deserializes pickled data
    # from this directory - confirm, and only load stores this app wrote.
    return FAISS.load_local(save_directory, embeddings)
73
 
74
 
75
def main():
    """Render the page: index the PDFs in ./PDFs and answer the user's input.

    Reads all PDFs from the ``./PDFs`` folder, splits their text into
    chunks, stores them in the local vector database and, if the user has
    entered a question, shows the content of the best-matching chunk.
    """
    load_dotenv()
    user_question = st.text_area("Eingabe:")
    folder_path = './PDFs'

    # Without a PDF folder there is nothing to index; tell the user instead
    # of letting os.listdir raise.
    if not os.path.isdir(folder_path):
        st.warning(f"Ordner '{folder_path}' wurde nicht gefunden.")
        return

    pdf_text = get_pdf_text(folder_path)
    text_chunks = get_text_chunks(pdf_text)

    # An empty chunk list cannot be turned into a vector store.
    if not text_chunks:
        st.warning(f"Keine PDF-Inhalte im Ordner '{folder_path}' gefunden.")
        return

    # NOTE(review): the store is rebuilt on every execution of main();
    # consider caching it if indexing turns out to be slow.
    create_vectorstore_and_store(text_chunks)

    # Only query the retriever once there is an actual question.
    if not user_question:
        return

    retriever = get_vectorstore().as_retriever()
    retrieved_docs = retriever.invoke(user_question)

    # Guard against an empty result list before indexing into it.
    if retrieved_docs:
        st.text(retrieved_docs[0].page_content)
89
+ # bei incoming pdf
90
+
91
+ #vectorstore_DB=get_vectorstore() # bei Abfrage durch Chatbot
92
+ #print(get_vectorstore().similarity_search_with_score("stelle")) # zeigt an ob Vektordatenbank gefüllt ist
93
+
94
+ #print(get_conversation_chain(get_vectorstore()))
95
+
96
+
97
 
 
 
98
 
99
# Script entry point: run the page only when this file is executed directly.
if __name__ == '__main__':
    main()