johannes123213 committed on
Commit
ffbadfd
1 Parent(s): 3acdeb8

Update app.py

Files changed (1)
  1. app.py +11 -103
app.py CHANGED
@@ -1,108 +1,16 @@
-
 from langchain.embeddings import HuggingFaceInstructEmbeddings
-from langchain.vectorstores import FAISS
 from langchain.text_splitter import CharacterTextSplitter
-from langchain.document_loaders import DirectoryLoader, PyPDFLoader
-import os
-from PyPDF2 import PdfReader
-from langchain.chains import RetrievalQAWithSourcesChain
-from langchain.memory import ConversationBufferMemory
-from langchain.chains import ConversationalRetrievalChain
-#from htmlTemplates import css, bot_template, user_template
-from langchain.llms import HuggingFaceHub
-from dotenv import load_dotenv
-import streamlit as st
-###########
-#pip install faiss-cpu
-#pip install langchain
-#pip install pypdf
-#pip install tiktoken
-#pip install InstructorEmbedding
-###############
-
-# Convert the PDFs into a single string
-def get_pdf_text(folder_path):
-    text = ""
-    # Iterate over all files in the given directory
-    for filename in os.listdir(folder_path):
-        filepath = os.path.join(folder_path, filename)
-
-        # Check whether the file has the ".pdf" extension
-        if os.path.isfile(filepath) and filename.lower().endswith(".pdf"):
-            pdf_reader = PdfReader(filepath)
-            for page in pdf_reader.pages:
-                text += page.extract_text()
-                #text += '\n'
-
-    return text
-
-# Create chunks
-def get_text_chunks(text):
-    # Configure how the text splitter works
-    text_splitter = CharacterTextSplitter(
-        separator="\n",
-        chunk_size=1000,
-        chunk_overlap=200,
-        length_function=len
-    )
-    chunks = text_splitter.split_text(text)
-    return chunks
-
-# Only used to create the local "Store" directory and save the vector database
-def create_vectorstore_and_store(text_chunks):
-    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
-    # Initiate Faiss DB
-    vectorstoreDB = FAISS.from_texts(texts=text_chunks, embedding=embeddings)  # texts=text_chunks,
-    ###
-    ### --> afterwards, delete the PDF directory or move its files, because of the next upload
-    ###
-    # Directory in which the vector DB is stored
-    save_directory = "Store"
-    # Save the vector DB locally
-    vectorstoreDB.save_local(save_directory)
-    print(vectorstoreDB)
-    return None
-
-########
-
-def get_vectorstore():
-    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
-    # Load the local vector database
-    save_directory = "Store"
-    vectorstoreDB = FAISS.load_local(save_directory, embeddings)
-    return vectorstoreDB
-
-
-def get_conversation_chain(vectorstore):
-    llm = HuggingFaceHub(repo_id="google/flan-t5-base", model_kwargs={"temperature": 0.5, "max_length": 512})
-
-    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
-    memory.save_context({"input": "hi"}, {"output": "whats up"})
-    conversation_chain = ConversationalRetrievalChain.from_llm(
-        llm=llm,
-        retriever=vectorstore.as_retriever(),
-        memory=memory
-    )
-    return conversation_chain
-
-def handle_userinput(user_question):
-    response = conversation({'question': user_question})
-    chat_history = response['chat_history']
-
-
+from langchain.vectorstores import Chroma
 
-def main():
-    load_dotenv()
-    user_question = st.text_area("Enter Question")
-    folder_path = './PDFs'
-    pdf_text = get_pdf_text(folder_path)
-    text_chunks = get_text_chunks(pdf_text)
+full_text = open("state_of_the_union.txt", "r").read()
+text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+texts = text_splitter.split_text(full_text)
 
-    #create_vectorstore_and_store(text_chunks)  # for an incoming PDF
+embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+db = Chroma.from_texts(texts, embeddings)
+retriever = db.as_retriever()
 
-    #vectorstore_DB = get_vectorstore()  # for a chatbot query
-    out = get_vectorstore().similarity_search_with_score("stelle")  # shows the vector store
-    conv_chain = get_conversation_chain(out)
-    st.json(conv_chain)
-if __name__ == '__main__':
-    main()
+retrieved_docs = retriever.invoke(
+    "What did the president say about Ketanji Brown Jackson?"
+)
+print(retrieved_docs[0].page_content)
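
The new app.py drops the Streamlit UI and runs a single fixed query end to end. For reference, here is a minimal sketch of how the same Chroma retriever could sit behind the question box that the removed main() provided. This is an illustration only, not part of the commit; it assumes the same state_of_the_union.txt corpus, instructor-xl embeddings, and retriever.invoke() call used in the new script.

# Sketch only (not part of this commit): the new Chroma retriever behind a Streamlit prompt.
import streamlit as st
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Same corpus and settings as the committed script (file assumed to exist locally).
full_text = open("state_of_the_union.txt", "r").read()
texts = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_text(full_text)
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
retriever = Chroma.from_texts(texts, embeddings).as_retriever()

# Question box kept from the removed main(); show the best-matching chunk for the question.
user_question = st.text_area("Enter Question")
if user_question:
    docs = retriever.invoke(user_question)
    if docs:
        st.write(docs[0].page_content)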