elia-waefler committed on
Commit
fcac63a
·
1 Parent(s): cff6e97

init files, idea

Browse files
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
5
+ <option name="ignoredPackages">
6
+ <value>
7
+ <list size="1">
8
+ <item index="0" class="java.lang.String" itemvalue="faiss" />
9
+ </list>
10
+ </value>
11
+ </option>
12
+ </inspection_tool>
13
+ <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
14
+ <option name="ignoredErrors">
15
+ <list>
16
+ <option value="E265" />
17
+ </list>
18
+ </option>
19
+ </inspection_tool>
20
+ </profile>
21
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (reverse-RAG)" project-jdk-type="Python SDK" />
4
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/reverse-RAG.iml" filepath="$PROJECT_DIR$/.idea/reverse-RAG.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/reverse-RAG.iml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$">
5
+ <excludeFolder url="file://$MODULE_DIR$/venv" />
6
+ </content>
7
+ <orderEntry type="inheritedJdk" />
8
+ <orderEntry type="sourceFolder" forTests="false" />
9
+ </component>
10
+ <component name="PyDocumentationSettings">
11
+ <option name="format" value="GOOGLE" />
12
+ <option name="myDocStringFormat" value="Google" />
13
+ </component>
14
+ </module>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
app.py CHANGED
@@ -1,197 +1,3 @@
1
- import streamlit as st
2
- import os
3
- # import openai
4
- from PyPDF2 import PdfReader
5
- from openai import OpenAI
6
- from langchain.chat_models import ChatOpenAI
7
-
8
- ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
9
-
10
-
11
- def gpt4_new(prompt_text):
12
- client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
13
- response = client.chat.completions.create(
14
- model="gpt-4",
15
- messages=[{"role": "system",
16
- "content": "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten,"
17
- "das Dokument in vorgegebene Kategorien klassifiziert."
18
- "Du gibts möglichst kurze Antworten, am besten ein Wort"
19
- "Du gibst keine Erklärungen oder Begründungen. "
20
- "Du klassifizierst nur nach den vorgegebenen Kategorien."
21
- "Wenn ein Dokument partout nicht klassifizierbar ist, "
22
- "antwortest du mit '<no classification>'"},
23
- {"role": "user", "content": prompt_text}])
24
- return response.choices[0].message.content
25
-
26
-
27
- # Define a function to ask a question to GPT-4
28
- def ask_gpt4(question):
29
- print(question) # we don't have to submit the question?
30
- try:
31
- # Use the chat function to send a message and get a response
32
- response = ChatOpenAI()
33
- # Extract the response text
34
- return response["choices"][0]["message"]["content"]
35
- except Exception as e:
36
- # Handle exceptions that may occur during the API call
37
- return str(e)
38
-
39
-
40
- def process_prompts_and_save(my_prompts):
41
- # Ensure the responses list is empty initially
42
- responses = []
43
-
44
- # Loop through each prompt in the list
45
- for prompt in my_prompts:
46
- try:
47
- # ADD LOGIC TO READ FILE AND CLASSIFY
48
- # Generate response for each prompt and append to the list
49
- response = ask_gpt4(prompt)
50
- sol = f"{prompt}\n\n{response}\n\n\n\n"
51
- print(sol)
52
- responses.append(sol)
53
- except Exception as e:
54
- # In case of an error, log the error with the prompt
55
- responses.append(f"{prompt}\n\nError:{str(e)}\n\n\n\n")
56
-
57
- # Writing all responses to a text file
58
- with open('gpt4_responses.txt', 'w', encoding='utf-8') as file:
59
- file.writelines(responses)
60
-
61
-
62
- def get_pdfs_text(pdf_docs):
63
- text = ""
64
- for pdf in pdf_docs:
65
- pdf_reader = PdfReader(pdf)
66
- for page in pdf_reader.pages:
67
- text += page.extract_text()
68
- return text
69
-
70
-
71
- def get_pdf_text(pdf_document):
72
- text = ""
73
- pdf_reader = PdfReader(pdf_document)
74
- for page in pdf_reader.pages:
75
- text += page.extract_text()
76
- return text
77
-
78
-
79
- def json_open(filename):
80
- with open(filename, "r") as f:
81
- mydata = f.read()
82
- return mydata
83
-
84
-
85
- def main():
86
- st.title("Doc Classifier")
87
- l, r = st.columns(2)
88
- if st.toggle("show README"):
89
- st.subheader("Funktion: ")
90
- st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente")
91
- st.write("des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren.")
92
- st.write("lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren.")
93
- st.write("Feedback und Bugs gerne an elia.waefler@insel.ch")
94
- st.write("Vielen Dank.")
95
- st.write("")
96
- with l:
97
- st.subheader("Limitationen: ")
98
- st.write("bisher nur PDFs")
99
- st.write("nur Disziplin, Doc typ. und Geschoss")
100
- st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
101
- st.write("")
102
- with r:
103
- st.subheader("geplante Erweiterungen:")
104
- st.write("Text Beschreibung wird von AI hinzugefügt")
105
- st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
106
- st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")
107
-
108
- if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
109
- uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)
110
- #print(uploaded_file)
111
- #print(uploaded_file.name)
112
-
113
- if st.button("classify KBOB!"):
114
- if uploaded_files is not None:
115
- with st.container():
116
- # col1, col2, col3, col4, col5 = st.columns(5)
117
- col1, col2, col3 = st.columns(3)
118
- all_metadata = []
119
- with col1:
120
- st.write("Disziplin")
121
- st.write(f"")
122
- with col2:
123
- st.write("Dokumententyp")
124
- st.write(f"")
125
- with col3:
126
- st.write("Geschoss")
127
- st.write(f"")
128
-
129
- for file in uploaded_files:
130
- metadata = [file.name]
131
- with col1:
132
- with st.spinner("GPT4 at work"):
133
- pdf_text = str(get_pdf_text(file))
134
- prompt_1 = auftrag_0 + auftrag_1_disziplin + str(Baubranchen_Disziplinen) + pdf_text
135
- answer_1 = gpt4_new(prompt_1)
136
- print(prompt_1)
137
- metadata.append(answer_1)
138
- st.write(answer_1)
139
-
140
- with col2:
141
- with st.spinner("GPT4 at work"):
142
- prompt_2 = auftrag_0 + auftrag_1_type + str(Dokumententypen) + pdf_text
143
- answer_2 = gpt4_new(prompt_2)
144
- print(prompt_2)
145
- metadata.append(answer_2)
146
-
147
- st.write(answer_2)
148
-
149
- with col3:
150
- with st.spinner("GPT4 at work"):
151
- prompt_3 = auftrag_0 + auftrag_1_ge + str(ASH_Geschosse) + pdf_text
152
- answer_3 = gpt4_new(prompt_3)
153
- print(prompt_3)
154
- metadata.append(answer_2)
155
-
156
- st.write(answer_3)
157
-
158
- all_metadata.append(metadata)
159
-
160
- metadata_filename = "ai_generated_metadata.txt"
161
- with open(metadata_filename, 'w', encoding='utf-8') as f:
162
- for line in all_metadata:
163
- f.writelines("\n")
164
- for item in line:
165
- f.writelines(item)
166
- f.writelines(";")
167
-
168
- f.writelines("\n")
169
-
170
- st.success("classified, saved")
171
- st.download_button(f"Download Metadata", json_open(metadata_filename), file_name=metadata_filename)
172
- else:
173
- st.warning("no file")
174
-
175
-
176
- if __name__ == "__main__":
177
- #prompts = ["classify the document, tell me the ", "hello"]
178
- #process_prompts_and_save(prompts)
179
- auftrag_0 = "Klassifiziere dieses Dokument nach "
180
- auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
181
- auftrag_1_type = "diesen 'Dokumententypen': "
182
- auftrag_1_ge = "diesen 'Geschossen': "
183
- Baubranchen_Disziplinen = ['A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
184
- 'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
185
- 'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
186
- 'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
187
- 'Z-Lichtplanung']
188
- auftrag_2 = "gib nur den am besten passendsten Eintrag zurück. " \
189
- "Keine weiteren Ausführungen oder Erklärungen. " \
190
- "Antworte am besten in einem Wort. " \
191
- "Hier der Dokumenteninhalt: "
192
- Dokumententypen = ['Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
193
- 'Prozessdokumentation', 'Fachdokumentation', 'Anlagedokumentation']
194
- ASH_Geschosse = ['U4', 'U3', 'U2', 'U1',
195
- 'A', 'B', 'C', 'D', 'E', 'F', 'G']
196
- #print(str(Baubranchen_Disziplinen))
197
- main()
 
1
+ """the idea is to embed all KBOB categories as vectores.
2
+ then when a new document in added, we do a sim search with the doc vector in the KBOB vectores
3
+ to map/classify. can be done in multiple steps. """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ask_app.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ complete, functional RAG App
3
+ stores vectors in session state, or locally.
4
+ add function to display retrieved documents
5
+ """
6
+
7
+ # import time
8
+ from datetime import datetime
9
+ # import openai
10
+ # import tiktoken
11
+ import streamlit as st
12
+ from PyPDF2 import PdfReader
13
+ from langchain.text_splitter import CharacterTextSplitter
14
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
15
+ from langchain.vectorstores import FAISS
16
+ from langchain.chat_models import ChatOpenAI
17
+ from langchain.memory import ConversationBufferMemory
18
+ from langchain.chains import ConversationalRetrievalChain
19
+ from html_templates import css, bot_template, user_template
20
+ from langchain.llms import HuggingFaceHub
21
+ import os
22
+ import numpy as np
23
+ import faiss_utils
24
+ from langchain_community.vectorstores import FAISS
25
+ from langchain.embeddings import OpenAIEmbeddings
26
+
27
+
28
def merge_faiss_indices(index1, index2):
    """
    Merge two FAISS indices into a new index, assuming both are of the same type and dimensionality.

    Args:
        index1 (faiss.Index): The first FAISS index.
        index2 (faiss.Index): The second FAISS index.

    Returns:
        faiss.Index: A new FAISS index containing all vectors from index1 and index2.

    NOTE(review): `FAISS` in this module is LangChain's vectorstore class, which
    has no `IndexFlatL2`, `IndexIVFFlat`, `METRIC_L2` or `rev_swig_ptr`
    attributes — those live on the native `faiss` module. As written, both
    isinstance checks fail with AttributeError before any merge happens.
    This function appears unused (ask_app's load path uses `merge_from`
    instead); confirm before relying on it, and switch to `import faiss`
    if it is ever needed.
    """

    # Guard: merging only makes sense for same-typed indices.
    if type(index1) != type(index2):
        raise ValueError("Indices are of different types")

    # Guard: vectors must have the same dimensionality to share an index.
    if index1.d != index2.d:
        raise ValueError("Indices have different dimensionality")

    # Determine type of indices
    if isinstance(index1, FAISS.IndexFlatL2):
        # Handle simple flat indices
        d = index1.d
        # Extract raw vectors from both indices (SWIG pointer -> numpy view).
        xb1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
        xb2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)

        # Combine vectors
        xb_combined = np.vstack((xb1, xb2))

        # Create a new index and add combined vectors
        new_index = FAISS.IndexFlatL2(d)
        new_index.add(xb_combined)
        return new_index

    elif isinstance(index1, FAISS.IndexIVFFlat):
        # Handle quantized indices (IndexIVFFlat)
        d = index1.d
        nlist = index1.nlist
        quantizer = FAISS.IndexFlatL2(d)  # Re-create the appropriate quantizer

        # Create a new index with the same configuration
        new_index = FAISS.IndexIVFFlat(quantizer, d, nlist, FAISS.METRIC_L2)

        # If the indices are already trained, you can directly add the vectors.
        # Otherwise, you may need to train new_index using a representative subset of vectors.
        vecs1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
        vecs2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)
        new_index.add(vecs1)
        new_index.add(vecs2)
        return new_index

    else:
        raise TypeError("Index type not supported for merging in this function")
83
+
84
+
85
def get_pdf_text(pdf_docs):
    """Return the concatenated text of every page of every uploaded PDF."""
    page_texts = []
    for document in pdf_docs:
        reader = PdfReader(document)
        page_texts.extend(page.extract_text() for page in reader.pages)
    return "".join(page_texts)
92
+
93
+
94
def get_text_chunks(text):
    """Split raw document text into 1000-char chunks with 200-char overlap."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
103
+
104
+
105
def get_faiss_vectorstore(text_chunks):
    """Embed the chunks into a FAISS vectorstore.

    Uses OpenAI embeddings when the session flag `sst.openai` is set,
    otherwise the HuggingFace instructor-xl model.
    """
    if sst.openai:
        embeddings = OpenAIEmbeddings()
    else:
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
112
+
113
+
114
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vectorstore.

    The LLM backend follows the session flag `sst.openai`; chat history is
    kept in a ConversationBufferMemory under the key 'chat_history'.
    """
    if sst.openai:
        llm = ChatOpenAI()
    else:
        llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})

    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
128
+
129
+
130
def handle_userinput(user_question):
    """Run the question through the conversation chain and render the chat.

    Even-indexed history entries are user turns, odd-indexed ones are AI
    replies (the chain alternates strictly).
    """
    response = sst.conversation({'question': user_question})
    sst.chat_history = response['chat_history']

    for turn, message in enumerate(sst.chat_history):
        if turn % 2 == 0:
            # User turn.
            st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            print(message)
            # AI turn.
            st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
            # Show the source document when the message carries one.
            if hasattr(message, 'source') and message.source:
                st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
145
+
146
+
147
# Service credentials, read from the environment at import time.
# FIX: the previous `if True:` guard was a dead construct wrapping these
# top-level statements; they now run unconditionally, exactly as before.
# NOTE: os.environ[...] raises KeyError for a missing variable, failing
# fast on misconfiguration.
BASE_URL = "https://api.vectara.io/v1"
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]
headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}
156
+
157
+
158
def main():
    """Streamlit entry point for the ASH RAG assistant.

    Wires up session state, the question input, PDF upload/embedding, and
    save/load of the FAISS vectorstore. All widget callbacks communicate
    through `sst` (st.session_state).
    """
    st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
    st.write(css, unsafe_allow_html=True)

    # Initialize every session-state key exactly once per session.
    if "conversation" not in sst:
        sst.conversation = None
    if "chat_history" not in sst:
        sst.chat_history = None
    if "page" not in sst:
        sst.page = "home"
    if "openai" not in sst:
        sst.openai = True  # default to the OpenAI backend
    if "login" not in sst:
        sst.login = False
    # The submitted_* keys mirror the widget values; the widgets are cleared
    # after each submit so re-runs do not re-fire the action.
    if 'submitted_user_query' not in sst:
        sst.submitted_user_query = ''
    if 'submitted_user_safe' not in sst:
        sst.submitted_user_safe = ''
    if 'submitted_user_load' not in sst:
        sst.submitted_user_load = ''

    def submit_user_query():
        # on_change callback: stash the query and clear the input widget.
        sst.submitted_user_query = sst.widget_user_query
        sst.widget_user_query = ''

    def submit_user_safe():
        # on_change callback: persist the current vectorstore to the typed path.
        sst.submitted_user_safe = sst.widget_user_safe
        sst.widget_user_safe = ''
        if "vectorstore" in sst:
            faiss_utils.save_local(sst.vectorstore, path=sst.submitted_user_safe)
            st.sidebar.success("saved")
        else:
            st.sidebar.warning("No embeddings to save. Please process documents first.")

    def submit_user_load():
        # on_change callback: load a saved index; merge it when one is already
        # in the session, otherwise adopt it as the new vectorstore.
        sst.submitted_user_load = sst.widget_user_load
        sst.widget_user_load = ''
        if os.path.exists(sst.submitted_user_load):
            new_db = faiss_utils.load_vectorstore(f"{sst.submitted_user_load}/faiss_index.index")
            if "vectorstore" in sst:
                if new_db is not None:  # Check if this is working
                    sst.vectorstore.merge_from(new_db)
                    sst.conversation = get_conversation_chain(sst.vectorstore)
                    st.sidebar.success("faiss loaded")
            else:
                if new_db is not None:  # Check if this is working
                    sst.vectorstore = new_db
                    sst.conversation = get_conversation_chain(new_db)
                    st.sidebar.success("faiss loaded")
        else:
            st.sidebar.warning("Couldn't load/find embeddings")

    st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
    # Simple password gate; everything below only renders on a match.
    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:

        st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)

        if sst.submitted_user_query:
            if "vectorstore" in sst:
                handle_userinput(sst.submitted_user_query)
            else:
                st.warning("no vectorstore loaded.")

        with st.sidebar:
            st.subheader("Your documents")
            pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
            if st.button("Process"):
                with st.spinner("Processing"):
                    # extract -> chunk -> embed, then rebuild the chat chain.
                    vec = get_faiss_vectorstore(get_text_chunks(get_pdf_text(pdf_docs)))
                    sst.vectorstore = vec
                    sst.conversation = get_conversation_chain(vec)
                    st.success("embedding complete")

            st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
                          on_change=submit_user_safe)

            st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
                          on_change=submit_user_load)
238
+
239
+
240
if __name__ == '__main__':
    # Module-wide shorthand for Streamlit's session state; the functions
    # above reference `sst` as a global.
    sst = st.session_state
    # App password must be present in the environment (KeyError otherwise).
    ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
    main()
classify_app.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ # import openai
4
+ from PyPDF2 import PdfReader
5
+ from openai import OpenAI
6
+ from langchain.chat_models import ChatOpenAI
7
+
8
+ ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
9
+
10
+
11
def gpt4_new(prompt_text):
    """Send the prompt to GPT-4 with the classification system message.

    Returns the text of the model's first choice.
    """
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
    system_message = {
        "role": "system",
        "content": "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten,"
                   "das Dokument in vorgegebene Kategorien klassifiziert."
                   "Du gibts möglichst kurze Antworten, am besten ein Wort"
                   "Du gibst keine Erklärungen oder Begründungen. "
                   "Du klassifizierst nur nach den vorgegebenen Kategorien."
                   "Wenn ein Dokument partout nicht klassifizierbar ist, "
                   "antwortest du mit '<no classification>'",
    }
    user_message = {"role": "user", "content": prompt_text}
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[system_message, user_message],
    )
    return response.choices[0].message.content
25
+
26
+
27
+ # Define a function to ask a question to GPT-4
28
# Define a function to ask a question to GPT-4
def ask_gpt4(question):
    """Ask the LangChain ChatOpenAI model a question and return the answer text.

    On any failure the exception message is returned as a string (callers
    write it straight into the response log).

    FIX: the original built `ChatOpenAI()` but never sent the question, then
    subscripted the model object like a dict — every call raised TypeError
    and returned the error string instead of an answer. The model is now
    actually invoked with the question.
    """
    print(question)  # log the outgoing question
    try:
        llm = ChatOpenAI()
        # predict() sends the question and returns the reply as a plain string.
        return llm.predict(question)
    except Exception as e:
        # Best-effort: surface the error text in place of an answer.
        return str(e)
38
+
39
+
40
def process_prompts_and_save(my_prompts):
    """Run each prompt through GPT-4 and dump all results to gpt4_responses.txt.

    Errors are logged inline per prompt rather than aborting the batch.
    """
    responses = []

    for prompt in my_prompts:
        try:
            # ADD LOGIC TO READ FILE AND CLASSIFY
            answer = ask_gpt4(prompt)
            entry = f"{prompt}\n\n{answer}\n\n\n\n"
            print(entry)
            responses.append(entry)
        except Exception as e:
            # Record the failure next to its prompt and keep going.
            responses.append(f"{prompt}\n\nError:{str(e)}\n\n\n\n")

    # Persist the whole batch in one go.
    with open('gpt4_responses.txt', 'w', encoding='utf-8') as file:
        file.writelines(responses)
60
+
61
+
62
def get_pdfs_text(pdf_docs):
    """Return the concatenated text of all pages across several PDFs."""
    collected = []
    for document in pdf_docs:
        reader = PdfReader(document)
        collected.extend(page.extract_text() for page in reader.pages)
    return "".join(collected)
69
+
70
+
71
def get_pdf_text(pdf_document):
    """Return the concatenated text of every page of a single PDF."""
    reader = PdfReader(pdf_document)
    return "".join(page.extract_text() for page in reader.pages)
77
+
78
+
79
def json_open(filename):
    """Return the full text content of *filename*.

    FIX: the encoding is now explicit. The metadata file this reads is
    written with encoding='utf-8' elsewhere in this module; relying on the
    locale default broke round-tripping of umlauts on Windows.

    NOTE(review): despite the name, this reads plain text, not JSON.
    """
    with open(filename, "r", encoding="utf-8") as f:
        return f.read()
83
+
84
+
85
def main():
    """Streamlit entry point for the document classifier.

    Renders an optional README, then (behind a password gate) lets the user
    upload PDFs and classifies each one by Disziplin / Dokumententyp /
    Geschoss via three GPT-4 calls, writing the results to a semicolon-
    separated metadata file offered for download.

    Reads the module-level prompt fragments (auftrag_*) and category lists
    (Baubranchen_Disziplinen, Dokumententypen, ASH_Geschosse) as globals.

    FIX: the Geschoss column previously stored `answer_2` (the document
    type) instead of `answer_3`, silently corrupting the third metadata
    field; it now stores `answer_3`.
    """
    st.title("Doc Classifier")
    l, r = st.columns(2)
    if st.toggle("show README"):
        st.subheader("Funktion: ")
        st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente")
        st.write("des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren.")
        st.write("lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren.")
        st.write("Feedback und Bugs gerne an elia.waefler@insel.ch")
        st.write("Vielen Dank.")
        st.write("")
        with l:
            st.subheader("Limitationen: ")
            st.write("bisher nur PDFs")
            st.write("nur Disziplin, Doc typ. und Geschoss")
            st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
            st.write("")
        with r:
            st.subheader("geplante Erweiterungen:")
            st.write("Text Beschreibung wird von AI hinzugefügt")
            st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
            st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")

    # Password gate; everything below only renders on a match.
    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
        uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)

        if st.button("classify KBOB!"):
            if uploaded_files is not None:
                with st.container():
                    # One column per classification dimension.
                    col1, col2, col3 = st.columns(3)
                    all_metadata = []
                    with col1:
                        st.write("Disziplin")
                        st.write("")
                    with col2:
                        st.write("Dokumententyp")
                        st.write("")
                    with col3:
                        st.write("Geschoss")
                        st.write("")

                    for file in uploaded_files:
                        # metadata row: [filename, disziplin, dokumententyp, geschoss]
                        metadata = [file.name]
                        with col1:
                            with st.spinner("GPT4 at work"):
                                pdf_text = str(get_pdf_text(file))
                                prompt_1 = auftrag_0 + auftrag_1_disziplin + str(Baubranchen_Disziplinen) + pdf_text
                                answer_1 = gpt4_new(prompt_1)
                                print(prompt_1)
                                metadata.append(answer_1)
                                st.write(answer_1)

                        with col2:
                            with st.spinner("GPT4 at work"):
                                prompt_2 = auftrag_0 + auftrag_1_type + str(Dokumententypen) + pdf_text
                                answer_2 = gpt4_new(prompt_2)
                                print(prompt_2)
                                metadata.append(answer_2)
                                st.write(answer_2)

                        with col3:
                            with st.spinner("GPT4 at work"):
                                prompt_3 = auftrag_0 + auftrag_1_ge + str(ASH_Geschosse) + pdf_text
                                answer_3 = gpt4_new(prompt_3)
                                print(prompt_3)
                                # BUG FIX: was metadata.append(answer_2)
                                metadata.append(answer_3)
                                st.write(answer_3)

                        all_metadata.append(metadata)

                    # Persist one semicolon-separated line per document.
                    metadata_filename = "ai_generated_metadata.txt"
                    with open(metadata_filename, 'w', encoding='utf-8') as f:
                        for line in all_metadata:
                            f.write("\n")
                            for item in line:
                                f.write(item)
                                f.write(";")
                            f.write("\n")

                    st.success("classified, saved")
                    st.download_button("Download Metadata", json_open(metadata_filename), file_name=metadata_filename)
            else:
                st.warning("no file")
174
+
175
+
176
if __name__ == "__main__":
    # Prompt fragments: a full prompt is auftrag_0 + auftrag_1_* +
    # str(<category list>) + <pdf text>. Defined as globals because main()
    # reads them directly.
    auftrag_0 = "Klassifiziere dieses Dokument nach "
    auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
    auftrag_1_type = "diesen 'Dokumententypen': "
    auftrag_1_ge = "diesen 'Geschossen': "
    # Closed category lists the model must pick from.
    Baubranchen_Disziplinen = ['A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
                               'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
                               'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
                               'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
                               'Z-Lichtplanung']
    # NOTE(review): auftrag_2 is defined but not referenced by main().
    auftrag_2 = "gib nur den am besten passendsten Eintrag zurück. " \
                "Keine weiteren Ausführungen oder Erklärungen. " \
                "Antworte am besten in einem Wort. " \
                "Hier der Dokumenteninhalt: "
    Dokumententypen = ['Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
                       'Prozessdokumentation', 'Fachdokumentation', 'Anlagedokumentation']
    ASH_Geschosse = ['U4', 'U3', 'U2', 'U1',
                     'A', 'B', 'C', 'D', 'E', 'F', 'G']
    main()
faiss_utils.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_community.vectorstores import FAISS
3
+ from langchain.embeddings import OpenAIEmbeddings
4
+
5
+
6
def embed(input_strings):
    """Embed the given strings with OpenAI embeddings into a FAISS vectorstore."""
    return FAISS.from_texts(texts=input_strings, embedding=OpenAIEmbeddings())
9
+
10
+
11
+ # Function to save a FAISS vectorstore to a specified path
12
+ def save_local(vectorstore, path="safe/"):
13
+ if not os.path.exists(path):
14
+ os.makedirs(path)
15
+ file_path = os.path.join(path, "faiss_index.index")
16
+ vectorstore.save_local(file_path)
17
+ print(f"FAISS vectorstore saved to {file_path}")
18
+
19
+
20
# Function to load a FAISS vectorstore from a specified path
def load_vectorstore(path):
    """Load and return a FAISS vectorstore saved at *path*.

    NOTE: allow_dangerous_deserialization unpickles the stored docstore —
    only load indices you created yourself.
    """
    embeddings = OpenAIEmbeddings()  # Needed to initialize the FAISS properly
    store = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
    print(f"FAISS vectorstore loaded from {path}")
    return store
26
+
27
+
28
# Example usage: embed, save, reload, then merge a second store.
if __name__ == "__main__":
    # Embed a few words
    words = ["hello", "world", "sample", "text"]
    faiss_db1 = embed(words)

    # Save the vectorstore (writes safe/faiss_index.index by default).
    save_local(faiss_db1)

    # Load the vectorstore from the same default location.
    loaded_db1 = load_vectorstore("safe/faiss_index.index")

    # Embed another set of words and create a second vectorstore
    more_words = ["FAISS", "database", "information", "retrieval"]
    faiss_db2 = embed(more_words)

    # In-place merge; the total below should equal len(words) + len(more_words).
    loaded_db1.merge_from(faiss_db2)
    print("Merged vectorstore with other vectorstore containing total vectors:", loaded_db1.index.ntotal)