elia-waefler committed on
Commit
fcac63a
·
1 Parent(s): cff6e97

init files, idea

Browse files
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
5
+ <option name="ignoredPackages">
6
+ <value>
7
+ <list size="1">
8
+ <item index="0" class="java.lang.String" itemvalue="faiss" />
9
+ </list>
10
+ </value>
11
+ </option>
12
+ </inspection_tool>
13
+ <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
14
+ <option name="ignoredErrors">
15
+ <list>
16
+ <option value="E265" />
17
+ </list>
18
+ </option>
19
+ </inspection_tool>
20
+ </profile>
21
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (reverse-RAG)" project-jdk-type="Python SDK" />
4
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/reverse-RAG.iml" filepath="$PROJECT_DIR$/.idea/reverse-RAG.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/reverse-RAG.iml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$">
5
+ <excludeFolder url="file://$MODULE_DIR$/venv" />
6
+ </content>
7
+ <orderEntry type="inheritedJdk" />
8
+ <orderEntry type="sourceFolder" forTests="false" />
9
+ </component>
10
+ <component name="PyDocumentationSettings">
11
+ <option name="format" value="GOOGLE" />
12
+ <option name="myDocStringFormat" value="Google" />
13
+ </component>
14
+ </module>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
app.py CHANGED
@@ -1,197 +1,3 @@
1
- import streamlit as st
2
- import os
3
- # import openai
4
- from PyPDF2 import PdfReader
5
- from openai import OpenAI
6
- from langchain.chat_models import ChatOpenAI
7
-
8
- ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
9
-
10
-
11
- def gpt4_new(prompt_text):
12
- client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
13
- response = client.chat.completions.create(
14
- model="gpt-4",
15
- messages=[{"role": "system",
16
- "content": "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten,"
17
- "das Dokument in vorgegebene Kategorien klassifiziert."
18
- "Du gibts möglichst kurze Antworten, am besten ein Wort"
19
- "Du gibst keine Erklärungen oder Begründungen. "
20
- "Du klassifizierst nur nach den vorgegebenen Kategorien."
21
- "Wenn ein Dokument partout nicht klassifizierbar ist, "
22
- "antwortest du mit '<no classification>'"},
23
- {"role": "user", "content": prompt_text}])
24
- return response.choices[0].message.content
25
-
26
-
27
- # Define a function to ask a question to GPT-4
28
- def ask_gpt4(question):
29
- print(question) # we don't have to submit the question?
30
- try:
31
- # Use the chat function to send a message and get a response
32
- response = ChatOpenAI()
33
- # Extract the response text
34
- return response["choices"][0]["message"]["content"]
35
- except Exception as e:
36
- # Handle exceptions that may occur during the API call
37
- return str(e)
38
-
39
-
40
- def process_prompts_and_save(my_prompts):
41
- # Ensure the responses list is empty initially
42
- responses = []
43
-
44
- # Loop through each prompt in the list
45
- for prompt in my_prompts:
46
- try:
47
- # ADD LOGIC TO READ FILE AND CLASSIFY
48
- # Generate response for each prompt and append to the list
49
- response = ask_gpt4(prompt)
50
- sol = f"{prompt}\n\n{response}\n\n\n\n"
51
- print(sol)
52
- responses.append(sol)
53
- except Exception as e:
54
- # In case of an error, log the error with the prompt
55
- responses.append(f"{prompt}\n\nError:{str(e)}\n\n\n\n")
56
-
57
- # Writing all responses to a text file
58
- with open('gpt4_responses.txt', 'w', encoding='utf-8') as file:
59
- file.writelines(responses)
60
-
61
-
62
- def get_pdfs_text(pdf_docs):
63
- text = ""
64
- for pdf in pdf_docs:
65
- pdf_reader = PdfReader(pdf)
66
- for page in pdf_reader.pages:
67
- text += page.extract_text()
68
- return text
69
-
70
-
71
- def get_pdf_text(pdf_document):
72
- text = ""
73
- pdf_reader = PdfReader(pdf_document)
74
- for page in pdf_reader.pages:
75
- text += page.extract_text()
76
- return text
77
-
78
-
79
- def json_open(filename):
80
- with open(filename, "r") as f:
81
- mydata = f.read()
82
- return mydata
83
-
84
-
85
- def main():
86
- st.title("Doc Classifier")
87
- l, r = st.columns(2)
88
- if st.toggle("show README"):
89
- st.subheader("Funktion: ")
90
- st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente")
91
- st.write("des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren.")
92
- st.write("lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren.")
93
- st.write("Feedback und Bugs gerne an elia.waefler@insel.ch")
94
- st.write("Vielen Dank.")
95
- st.write("")
96
- with l:
97
- st.subheader("Limitationen: ")
98
- st.write("bisher nur PDFs")
99
- st.write("nur Disziplin, Doc typ. und Geschoss")
100
- st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
101
- st.write("")
102
- with r:
103
- st.subheader("geplante Erweiterungen:")
104
- st.write("Text Beschreibung wird von AI hinzugefügt")
105
- st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
106
- st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")
107
-
108
- if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
109
- uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)
110
- #print(uploaded_file)
111
- #print(uploaded_file.name)
112
-
113
- if st.button("classify KBOB!"):
114
- if uploaded_files is not None:
115
- with st.container():
116
- # col1, col2, col3, col4, col5 = st.columns(5)
117
- col1, col2, col3 = st.columns(3)
118
- all_metadata = []
119
- with col1:
120
- st.write("Disziplin")
121
- st.write(f"")
122
- with col2:
123
- st.write("Dokumententyp")
124
- st.write(f"")
125
- with col3:
126
- st.write("Geschoss")
127
- st.write(f"")
128
-
129
- for file in uploaded_files:
130
- metadata = [file.name]
131
- with col1:
132
- with st.spinner("GPT4 at work"):
133
- pdf_text = str(get_pdf_text(file))
134
- prompt_1 = auftrag_0 + auftrag_1_disziplin + str(Baubranchen_Disziplinen) + pdf_text
135
- answer_1 = gpt4_new(prompt_1)
136
- print(prompt_1)
137
- metadata.append(answer_1)
138
- st.write(answer_1)
139
-
140
- with col2:
141
- with st.spinner("GPT4 at work"):
142
- prompt_2 = auftrag_0 + auftrag_1_type + str(Dokumententypen) + pdf_text
143
- answer_2 = gpt4_new(prompt_2)
144
- print(prompt_2)
145
- metadata.append(answer_2)
146
-
147
- st.write(answer_2)
148
-
149
- with col3:
150
- with st.spinner("GPT4 at work"):
151
- prompt_3 = auftrag_0 + auftrag_1_ge + str(ASH_Geschosse) + pdf_text
152
- answer_3 = gpt4_new(prompt_3)
153
- print(prompt_3)
154
- metadata.append(answer_2)
155
-
156
- st.write(answer_3)
157
-
158
- all_metadata.append(metadata)
159
-
160
- metadata_filename = "ai_generated_metadata.txt"
161
- with open(metadata_filename, 'w', encoding='utf-8') as f:
162
- for line in all_metadata:
163
- f.writelines("\n")
164
- for item in line:
165
- f.writelines(item)
166
- f.writelines(";")
167
-
168
- f.writelines("\n")
169
-
170
- st.success("classified, saved")
171
- st.download_button(f"Download Metadata", json_open(metadata_filename), file_name=metadata_filename)
172
- else:
173
- st.warning("no file")
174
-
175
-
176
- if __name__ == "__main__":
177
- #prompts = ["classify the document, tell me the ", "hello"]
178
- #process_prompts_and_save(prompts)
179
- auftrag_0 = "Klassifiziere dieses Dokument nach "
180
- auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
181
- auftrag_1_type = "diesen 'Dokumententypen': "
182
- auftrag_1_ge = "diesen 'Geschossen': "
183
- Baubranchen_Disziplinen = ['A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
184
- 'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
185
- 'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
186
- 'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
187
- 'Z-Lichtplanung']
188
- auftrag_2 = "gib nur den am besten passendsten Eintrag zurück. " \
189
- "Keine weiteren Ausführungen oder Erklärungen. " \
190
- "Antworte am besten in einem Wort. " \
191
- "Hier der Dokumenteninhalt: "
192
- Dokumententypen = ['Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
193
- 'Prozessdokumentation', 'Fachdokumentation', 'Anlagedokumentation']
194
- ASH_Geschosse = ['U4', 'U3', 'U2', 'U1',
195
- 'A', 'B', 'C', 'D', 'E', 'F', 'G']
196
- #print(str(Baubranchen_Disziplinen))
197
- main()
 
1
+ """the idea is to embed all KBOB categories as vectores.
2
+ then when a new document in added, we do a sim search with the doc vector in the KBOB vectores
3
+ to map/classify. can be done in multiple steps. """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ask_app.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ complete, functional RAG App
3
+ stores vectors in session state, or locally.
4
+ add function to display retrieved documents
5
+ """
6
+
7
+ # import time
8
+ from datetime import datetime
9
+ # import openai
10
+ # import tiktoken
11
+ import streamlit as st
12
+ from PyPDF2 import PdfReader
13
+ from langchain.text_splitter import CharacterTextSplitter
14
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
15
+ from langchain.vectorstores import FAISS
16
+ from langchain.chat_models import ChatOpenAI
17
+ from langchain.memory import ConversationBufferMemory
18
+ from langchain.chains import ConversationalRetrievalChain
19
+ from html_templates import css, bot_template, user_template
20
+ from langchain.llms import HuggingFaceHub
21
+ import os
22
+ import numpy as np
23
+ import faiss_utils
24
+ from langchain_community.vectorstores import FAISS
25
+ from langchain.embeddings import OpenAIEmbeddings
26
+
27
+
28
def merge_faiss_indices(index1, index2):
    """
    Merge two FAISS indices into a new index, assuming both are of the same type and dimensionality.

    Args:
        index1 (faiss.Index): The first FAISS index.
        index2 (faiss.Index): The second FAISS index.

    Returns:
        faiss.Index: A new FAISS index containing all vectors from index1 and index2.

    NOTE(review): `FAISS` in this module is LangChain's vectorstore class, which
    has no `IndexFlatL2`, `IndexIVFFlat`, `METRIC_L2` or `rev_swig_ptr`
    attributes — those live on the native `faiss` module. As written, both
    isinstance checks fail with AttributeError before any merge happens.
    This function appears unused (ask_app's load path uses `merge_from`
    instead); confirm before relying on it, and switch to `import faiss`
    if it is ever needed.
    """

    # Guard: merging only makes sense for same-typed indices.
    if type(index1) != type(index2):
        raise ValueError("Indices are of different types")

    # Guard: vectors must have the same dimensionality to share an index.
    if index1.d != index2.d:
        raise ValueError("Indices have different dimensionality")

    # Determine type of indices
    if isinstance(index1, FAISS.IndexFlatL2):
        # Handle simple flat indices
        d = index1.d
        # Extract raw vectors from both indices (SWIG pointer -> numpy view).
        xb1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
        xb2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)

        # Combine vectors
        xb_combined = np.vstack((xb1, xb2))

        # Create a new index and add combined vectors
        new_index = FAISS.IndexFlatL2(d)
        new_index.add(xb_combined)
        return new_index

    elif isinstance(index1, FAISS.IndexIVFFlat):
        # Handle quantized indices (IndexIVFFlat)
        d = index1.d
        nlist = index1.nlist
        quantizer = FAISS.IndexFlatL2(d)  # Re-create the appropriate quantizer

        # Create a new index with the same configuration
        new_index = FAISS.IndexIVFFlat(quantizer, d, nlist, FAISS.METRIC_L2)

        # If the indices are already trained, you can directly add the vectors.
        # Otherwise, you may need to train new_index using a representative subset of vectors.
        vecs1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
        vecs2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)
        new_index.add(vecs1)
        new_index.add(vecs2)
        return new_index

    else:
        raise TypeError("Index type not supported for merging in this function")
83
+
84
+
85
def get_pdf_text(pdf_docs):
    """Return the concatenated text of every page of every uploaded PDF."""
    page_texts = []
    for document in pdf_docs:
        reader = PdfReader(document)
        page_texts.extend(page.extract_text() for page in reader.pages)
    return "".join(page_texts)
92
+
93
+
94
def get_text_chunks(text):
    """Split raw document text into 1000-char chunks with 200-char overlap."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
103
+
104
+
105
def get_faiss_vectorstore(text_chunks):
    """Embed the chunks into a FAISS vectorstore.

    Uses OpenAI embeddings when the session flag `sst.openai` is set,
    otherwise the HuggingFace instructor-xl model.
    """
    if sst.openai:
        embeddings = OpenAIEmbeddings()
    else:
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
112
+
113
+
114
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vectorstore.

    The LLM backend follows the session flag `sst.openai`; chat history is
    kept in a ConversationBufferMemory under the key 'chat_history'.
    """
    if sst.openai:
        llm = ChatOpenAI()
    else:
        llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})

    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
128
+
129
+
130
def handle_userinput(user_question):
    """Run the question through the conversation chain and render the chat.

    Even-indexed history entries are user turns, odd-indexed ones are AI
    replies (the chain alternates strictly).
    """
    response = sst.conversation({'question': user_question})
    sst.chat_history = response['chat_history']

    for turn, message in enumerate(sst.chat_history):
        if turn % 2 == 0:
            # User turn.
            st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            print(message)
            # AI turn.
            st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
            # Show the source document when the message carries one.
            if hasattr(message, 'source') and message.source:
                st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
145
+
146
+
147
# Service credentials, read from the environment at import time.
# FIX: the previous `if True:` guard was a dead construct wrapping these
# top-level statements; they now run unconditionally, exactly as before.
# NOTE: os.environ[...] raises KeyError for a missing variable, failing
# fast on misconfiguration.
BASE_URL = "https://api.vectara.io/v1"
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]
headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}
156
+
157
+
158
def main():
    """Streamlit entry point for the ASH RAG assistant.

    Wires up session state, the question input, PDF upload/embedding, and
    save/load of the FAISS vectorstore. All widget callbacks communicate
    through `sst` (st.session_state).
    """
    st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
    st.write(css, unsafe_allow_html=True)

    # Initialize every session-state key exactly once per session.
    if "conversation" not in sst:
        sst.conversation = None
    if "chat_history" not in sst:
        sst.chat_history = None
    if "page" not in sst:
        sst.page = "home"
    if "openai" not in sst:
        sst.openai = True  # default to the OpenAI backend
    if "login" not in sst:
        sst.login = False
    # The submitted_* keys mirror the widget values; the widgets are cleared
    # after each submit so re-runs do not re-fire the action.
    if 'submitted_user_query' not in sst:
        sst.submitted_user_query = ''
    if 'submitted_user_safe' not in sst:
        sst.submitted_user_safe = ''
    if 'submitted_user_load' not in sst:
        sst.submitted_user_load = ''

    def submit_user_query():
        # on_change callback: stash the query and clear the input widget.
        sst.submitted_user_query = sst.widget_user_query
        sst.widget_user_query = ''

    def submit_user_safe():
        # on_change callback: persist the current vectorstore to the typed path.
        sst.submitted_user_safe = sst.widget_user_safe
        sst.widget_user_safe = ''
        if "vectorstore" in sst:
            faiss_utils.save_local(sst.vectorstore, path=sst.submitted_user_safe)
            st.sidebar.success("saved")
        else:
            st.sidebar.warning("No embeddings to save. Please process documents first.")

    def submit_user_load():
        # on_change callback: load a saved index; merge it when one is already
        # in the session, otherwise adopt it as the new vectorstore.
        sst.submitted_user_load = sst.widget_user_load
        sst.widget_user_load = ''
        if os.path.exists(sst.submitted_user_load):
            new_db = faiss_utils.load_vectorstore(f"{sst.submitted_user_load}/faiss_index.index")
            if "vectorstore" in sst:
                if new_db is not None:  # Check if this is working
                    sst.vectorstore.merge_from(new_db)
                    sst.conversation = get_conversation_chain(sst.vectorstore)
                    st.sidebar.success("faiss loaded")
            else:
                if new_db is not None:  # Check if this is working
                    sst.vectorstore = new_db
                    sst.conversation = get_conversation_chain(new_db)
                    st.sidebar.success("faiss loaded")
        else:
            st.sidebar.warning("Couldn't load/find embeddings")

    st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
    # Simple password gate; everything below only renders on a match.
    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:

        st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)

        if sst.submitted_user_query:
            if "vectorstore" in sst:
                handle_userinput(sst.submitted_user_query)
            else:
                st.warning("no vectorstore loaded.")

        with st.sidebar:
            st.subheader("Your documents")
            pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
            if st.button("Process"):
                with st.spinner("Processing"):
                    # extract -> chunk -> embed, then rebuild the chat chain.
                    vec = get_faiss_vectorstore(get_text_chunks(get_pdf_text(pdf_docs)))
                    sst.vectorstore = vec
                    sst.conversation = get_conversation_chain(vec)
                    st.success("embedding complete")

            st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
                          on_change=submit_user_safe)

            st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
                          on_change=submit_user_load)
238
+
239
+
240
if __name__ == '__main__':
    # Module-wide shorthand for Streamlit's session state; the functions
    # above reference `sst` as a global.
    sst = st.session_state
    # App password must be present in the environment (KeyError otherwise).
    ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
    main()
classify_app.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ # import openai
4
+ from PyPDF2 import PdfReader
5
+ from openai import OpenAI
6
+ from langchain.chat_models import ChatOpenAI
7
+
8
+ ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
9
+
10
+
11
def gpt4_new(prompt_text):
    """Send the prompt to GPT-4 with the classification system message.

    Returns the text of the model's first choice.
    """
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
    system_message = {
        "role": "system",
        "content": "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten,"
                   "das Dokument in vorgegebene Kategorien klassifiziert."
                   "Du gibts möglichst kurze Antworten, am besten ein Wort"
                   "Du gibst keine Erklärungen oder Begründungen. "
                   "Du klassifizierst nur nach den vorgegebenen Kategorien."
                   "Wenn ein Dokument partout nicht klassifizierbar ist, "
                   "antwortest du mit '<no classification>'",
    }
    user_message = {"role": "user", "content": prompt_text}
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[system_message, user_message],
    )
    return response.choices[0].message.content
25
+
26
+
27
+ # Define a function to ask a question to GPT-4
28
# Define a function to ask a question to GPT-4
def ask_gpt4(question):
    """Ask the LangChain ChatOpenAI model a question and return the answer text.

    On any failure the exception message is returned as a string (callers
    write it straight into the response log).

    FIX: the original built `ChatOpenAI()` but never sent the question, then
    subscripted the model object like a dict — every call raised TypeError
    and returned the error string instead of an answer. The model is now
    actually invoked with the question.
    """
    print(question)  # log the outgoing question
    try:
        llm = ChatOpenAI()
        # predict() sends the question and returns the reply as a plain string.
        return llm.predict(question)
    except Exception as e:
        # Best-effort: surface the error text in place of an answer.
        return str(e)
38
+
39
+
40
def process_prompts_and_save(my_prompts):
    """Run each prompt through GPT-4 and dump all results to gpt4_responses.txt.

    Errors are logged inline per prompt rather than aborting the batch.
    """
    responses = []

    for prompt in my_prompts:
        try:
            # ADD LOGIC TO READ FILE AND CLASSIFY
            answer = ask_gpt4(prompt)
            entry = f"{prompt}\n\n{answer}\n\n\n\n"
            print(entry)
            responses.append(entry)
        except Exception as e:
            # Record the failure next to its prompt and keep going.
            responses.append(f"{prompt}\n\nError:{str(e)}\n\n\n\n")

    # Persist the whole batch in one go.
    with open('gpt4_responses.txt', 'w', encoding='utf-8') as file:
        file.writelines(responses)
60
+
61
+
62
def get_pdfs_text(pdf_docs):
    """Return the concatenated text of all pages across several PDFs."""
    collected = []
    for document in pdf_docs:
        reader = PdfReader(document)
        collected.extend(page.extract_text() for page in reader.pages)
    return "".join(collected)
69
+
70
+
71
def get_pdf_text(pdf_document):
    """Return the concatenated text of every page of a single PDF."""
    reader = PdfReader(pdf_document)
    return "".join(page.extract_text() for page in reader.pages)
77
+
78
+
79
def json_open(filename):
    """Return the full text content of *filename*.

    FIX: the encoding is now explicit. The metadata file this reads is
    written with encoding='utf-8' elsewhere in this module; relying on the
    locale default broke round-tripping of umlauts on Windows.

    NOTE(review): despite the name, this reads plain text, not JSON.
    """
    with open(filename, "r", encoding="utf-8") as f:
        return f.read()
83
+
84
+
85
def main():
    """Streamlit entry point for the document classifier.

    Renders an optional README, then (behind a password gate) lets the user
    upload PDFs and classifies each one by Disziplin / Dokumententyp /
    Geschoss via three GPT-4 calls, writing the results to a semicolon-
    separated metadata file offered for download.

    Reads the module-level prompt fragments (auftrag_*) and category lists
    (Baubranchen_Disziplinen, Dokumententypen, ASH_Geschosse) as globals.

    FIX: the Geschoss column previously stored `answer_2` (the document
    type) instead of `answer_3`, silently corrupting the third metadata
    field; it now stores `answer_3`.
    """
    st.title("Doc Classifier")
    l, r = st.columns(2)
    if st.toggle("show README"):
        st.subheader("Funktion: ")
        st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente")
        st.write("des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren.")
        st.write("lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren.")
        st.write("Feedback und Bugs gerne an elia.waefler@insel.ch")
        st.write("Vielen Dank.")
        st.write("")
        with l:
            st.subheader("Limitationen: ")
            st.write("bisher nur PDFs")
            st.write("nur Disziplin, Doc typ. und Geschoss")
            st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
            st.write("")
        with r:
            st.subheader("geplante Erweiterungen:")
            st.write("Text Beschreibung wird von AI hinzugefügt")
            st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
            st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")

    # Password gate; everything below only renders on a match.
    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
        uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)

        if st.button("classify KBOB!"):
            if uploaded_files is not None:
                with st.container():
                    # One column per classification dimension.
                    col1, col2, col3 = st.columns(3)
                    all_metadata = []
                    with col1:
                        st.write("Disziplin")
                        st.write("")
                    with col2:
                        st.write("Dokumententyp")
                        st.write("")
                    with col3:
                        st.write("Geschoss")
                        st.write("")

                    for file in uploaded_files:
                        # metadata row: [filename, disziplin, dokumententyp, geschoss]
                        metadata = [file.name]
                        with col1:
                            with st.spinner("GPT4 at work"):
                                pdf_text = str(get_pdf_text(file))
                                prompt_1 = auftrag_0 + auftrag_1_disziplin + str(Baubranchen_Disziplinen) + pdf_text
                                answer_1 = gpt4_new(prompt_1)
                                print(prompt_1)
                                metadata.append(answer_1)
                                st.write(answer_1)

                        with col2:
                            with st.spinner("GPT4 at work"):
                                prompt_2 = auftrag_0 + auftrag_1_type + str(Dokumententypen) + pdf_text
                                answer_2 = gpt4_new(prompt_2)
                                print(prompt_2)
                                metadata.append(answer_2)
                                st.write(answer_2)

                        with col3:
                            with st.spinner("GPT4 at work"):
                                prompt_3 = auftrag_0 + auftrag_1_ge + str(ASH_Geschosse) + pdf_text
                                answer_3 = gpt4_new(prompt_3)
                                print(prompt_3)
                                # BUG FIX: was metadata.append(answer_2)
                                metadata.append(answer_3)
                                st.write(answer_3)

                        all_metadata.append(metadata)

                    # Persist one semicolon-separated line per document.
                    metadata_filename = "ai_generated_metadata.txt"
                    with open(metadata_filename, 'w', encoding='utf-8') as f:
                        for line in all_metadata:
                            f.write("\n")
                            for item in line:
                                f.write(item)
                                f.write(";")
                            f.write("\n")

                    st.success("classified, saved")
                    st.download_button("Download Metadata", json_open(metadata_filename), file_name=metadata_filename)
            else:
                st.warning("no file")
174
+
175
+
176
if __name__ == "__main__":
    # Prompt fragments: a full prompt is auftrag_0 + auftrag_1_* +
    # str(<category list>) + <pdf text>. Defined as globals because main()
    # reads them directly.
    auftrag_0 = "Klassifiziere dieses Dokument nach "
    auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
    auftrag_1_type = "diesen 'Dokumententypen': "
    auftrag_1_ge = "diesen 'Geschossen': "
    # Closed category lists the model must pick from.
    Baubranchen_Disziplinen = ['A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
                               'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
                               'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
                               'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
                               'Z-Lichtplanung']
    # NOTE(review): auftrag_2 is defined but not referenced by main().
    auftrag_2 = "gib nur den am besten passendsten Eintrag zurück. " \
                "Keine weiteren Ausführungen oder Erklärungen. " \
                "Antworte am besten in einem Wort. " \
                "Hier der Dokumenteninhalt: "
    Dokumententypen = ['Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
                       'Prozessdokumentation', 'Fachdokumentation', 'Anlagedokumentation']
    ASH_Geschosse = ['U4', 'U3', 'U2', 'U1',
                     'A', 'B', 'C', 'D', 'E', 'F', 'G']
    main()
faiss_utils.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_community.vectorstores import FAISS
3
+ from langchain.embeddings import OpenAIEmbeddings
4
+
5
+
6
def embed(input_strings):
    """Embed the given strings with OpenAI embeddings into a FAISS vectorstore."""
    return FAISS.from_texts(texts=input_strings, embedding=OpenAIEmbeddings())
9
+
10
+
11
+ # Function to save a FAISS vectorstore to a specified path
12
+ def save_local(vectorstore, path="safe/"):
13
+ if not os.path.exists(path):
14
+ os.makedirs(path)
15
+ file_path = os.path.join(path, "faiss_index.index")
16
+ vectorstore.save_local(file_path)
17
+ print(f"FAISS vectorstore saved to {file_path}")
18
+
19
+
20
# Function to load a FAISS vectorstore from a specified path
def load_vectorstore(path):
    """Load and return a FAISS vectorstore saved at *path*.

    NOTE: allow_dangerous_deserialization unpickles the stored docstore —
    only load indices you created yourself.
    """
    embeddings = OpenAIEmbeddings()  # Needed to initialize the FAISS properly
    store = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
    print(f"FAISS vectorstore loaded from {path}")
    return store
26
+
27
+
28
# Example usage: embed, save, reload, then merge a second store.
if __name__ == "__main__":
    # Embed a few words
    words = ["hello", "world", "sample", "text"]
    faiss_db1 = embed(words)

    # Save the vectorstore (writes safe/faiss_index.index by default).
    save_local(faiss_db1)

    # Load the vectorstore from the same default location.
    loaded_db1 = load_vectorstore("safe/faiss_index.index")

    # Embed another set of words and create a second vectorstore
    more_words = ["FAISS", "database", "information", "retrieval"]
    faiss_db2 = embed(more_words)

    # In-place merge; the total below should equal len(words) + len(more_words).
    loaded_db1.merge_from(faiss_db2)
    print("Merged vectorstore with other vectorstore containing total vectors:", loaded_db1.index.ntotal)