Spaces:

elia-waefler
/

ki_inselspital

Runtime error

App Files Community

elia-waefler committed on May 14, 2024

Commit

0c4e36a

1 Parent(s): 6bccf6f

structure

Browse files

Files changed (14) hide show

.gitignore +0 -2
LICENSE +201 -201
app.py +2 -6
KBOB_Klassifizierung.xlsx → data/KBOB_Klassifizierung.xlsx +0 -0
requirements.txt +15 -15
ask_app.py → util/ask_app.py +243 -243
classify_app.py → util/classify_app.py +197 -197
ingest.py → util/ingest.py +126 -126
my_1_reader.py → util/my_1_reader.py +201 -201
my_1_writer.py → util/my_1_writer.py +0 -0
my_2_sim_search.py → util/my_2_sim_search.py +163 -163
my_new_openai.py → util/my_new_openai.py +151 -151
my_vectors.py → util/my_vectors.py +0 -0
setup_db.py → util/setup_db.py +50 -50

.gitignore CHANGED Viewed

@@ -120,9 +120,7 @@ celerybeat.pid
 # Environments
 .env
-.venv
 env/
-venv/
 ENV/
 env.bak/
 venv.bak/

 # Environments
 .env
 env/
 ENV/
 env.bak/
 venv.bak/

LICENSE CHANGED Viewed

@@ -1,201 +1,201 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-   1. Definitions.
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-   END OF TERMS AND CONDITIONS
-   APPENDIX: How to apply the Apache License to your work.
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-   Copyright [yyyy] [name of copyright owner]
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-       http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

app.py CHANGED Viewed

@@ -1,14 +1,10 @@
 """
 testing my own vectors
 """
-import ingest
-import my_2_sim_search
-import my_new_openai
-import setup_db
 import time
 import streamlit as st
 import os
-import my_vectors
 def merge_indices(index1, index2):
@@ -148,7 +144,7 @@ def main():
                         st.warning("unsaved embeddings will be lost.")
         else:
             file = st.file_uploader("upload file", accept_multiple_files=False)
-            vec_store = setup_db.load_vectorstore_from_excel("KBOB_Klassifizierung.xlsx")
             if st.button("classify me!"):
                 with st.spinner("Classifying..."):
                     query_vecs = []

 """
 testing my own vectors
 """
+from util import setup_db, my_vectors, my_2_sim_search, ingest, my_new_openai
 import time
 import streamlit as st
 import os
 def merge_indices(index1, index2):
                         st.warning("unsaved embeddings will be lost.")
         else:
             file = st.file_uploader("upload file", accept_multiple_files=False)
+            vec_store = setup_db.load_vectorstore_from_excel("data/KBOB_Klassifizierung.xlsx")
             if st.button("classify me!"):
                 with st.spinner("Classifying..."):
                     query_vecs = []

KBOB_Klassifizierung.xlsx → data/KBOB_Klassifizierung.xlsx RENAMED Viewed

File without changes

requirements.txt CHANGED Viewed

@@ -1,16 +1,16 @@
-streamlit~=1.33.0
-bcrypt~=4.1.2
-psycopg2-binary~=2.9.9
-openai~=1.23.2
-pypdf2~=3.0.1
-langchain~=0.1.16
-tiktoken~=0.6.0
-numpy~=1.26.4
-requests~=2.31.0
-pandas~=2.2.2
-tabula~=1.0.5
-pdfplumber~=0.11.0
-PyMuPDF~=1.24.3
-fitz~=0.0.1.dev2
-pillow~=10.3.0
 openpyxl~=3.1.2

+streamlit~=1.33.0
+bcrypt~=4.1.2
+psycopg2-binary~=2.9.9
+openai~=1.23.2
+pypdf2~=3.0.1
+langchain~=0.1.16
+tiktoken~=0.6.0
+numpy~=1.26.4
+requests~=2.31.0
+pandas~=2.2.2
+tabula~=1.0.5
+pdfplumber~=0.11.0
+PyMuPDF~=1.24.3
+fitz~=0.0.1.dev2
+pillow~=10.3.0
 openpyxl~=3.1.2

ask_app.py → util/ask_app.py RENAMED Viewed

@@ -1,243 +1,243 @@
-"""
-complete, functional RAG App
-stores vectors in session state, or locally.
-add function to display retrieved documents
-"""
-# import time
-from datetime import datetime
-# import openai
-# import tiktoken
-import streamlit as st
-from PyPDF2 import PdfReader
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.chat_models import ChatOpenAI
-from langchain.memory import ConversationBufferMemory
-from langchain.chains import ConversationalRetrievalChain
-from html_templates import css, bot_template, user_template
-from langchain.llms import HuggingFaceHub
-import os
-import numpy as np
-import faiss_utils
-from langchain_community.vectorstores import FAISS
-from langchain.embeddings import OpenAIEmbeddings
-def merge_faiss_indices(index1, index2):
-    """
-    Merge two FAISS indices into a new index, assuming both are of the same type and dimensionality.
-    Args:
-    index1 (faiss.Index): The first FAISS index.
-    index2 (faiss.Index): The second FAISS index.
-    Returns:
-    faiss.Index: A new FAISS index containing all vectors from index1 and index2.
-    """
-    # Check if both indices are the same type
-    if type(index1) != type(index2):
-        raise ValueError("Indices are of different types")
-    # Check dimensionality
-    if index1.d != index2.d:
-        raise ValueError("Indices have different dimensionality")
-    # Determine type of indices
-    if isinstance(index1, FAISS.IndexFlatL2):
-        # Handle simple flat indices
-        d = index1.d
-        # Extract vectors from both indices
-        xb1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
-        xb2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)
-        # Combine vectors
-        xb_combined = np.vstack((xb1, xb2))
-        # Create a new index and add combined vectors
-        new_index = FAISS.IndexFlatL2(d)
-        new_index.add(xb_combined)
-        return new_index
-    elif isinstance(index1, FAISS.IndexIVFFlat):
-        # Handle quantized indices (IndexIVFFlat)
-        d = index1.d
-        nlist = index1.nlist
-        quantizer = FAISS.IndexFlatL2(d)  # Re-create the appropriate quantizer
-        # Create a new index with the same configuration
-        new_index = FAISS.IndexIVFFlat(quantizer, d, nlist, FAISS.METRIC_L2)
-        # If the indices are already trained, you can directly add the vectors
-        # Otherwise, you may need to train new_index using a representative subset of vectors
-        vecs1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
-        vecs2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)
-        new_index.add(vecs1)
-        new_index.add(vecs2)
-        return new_index
-    else:
-        raise TypeError("Index type not supported for merging in this function")
-def get_pdf_text(pdf_docs):
-    text = ""
-    for pdf in pdf_docs:
-        pdf_reader = PdfReader(pdf)
-        for page in pdf_reader.pages:
-            text += page.extract_text()
-    return text
-def get_text_chunks(text):
-    text_splitter = CharacterTextSplitter(
-        separator="\n",
-        chunk_size=1000,
-        chunk_overlap=200,
-        length_function=len
-    )
-    chunks = text_splitter.split_text(text)
-    return chunks
-def get_faiss_vectorstore(text_chunks):
-    if sst.openai:
-        my_embeddings = OpenAIEmbeddings()
-    else:
-        my_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
-    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=my_embeddings)
-    return vectorstore
-def get_conversation_chain(vectorstore):
-    if sst.openai:
-        llm = ChatOpenAI()
-    else:
-        llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})
-    memory = ConversationBufferMemory(
-        memory_key='chat_history', return_messages=True)
-    conversation_chain = ConversationalRetrievalChain.from_llm(
-        llm=llm,
-        retriever=vectorstore.as_retriever(),
-        memory=memory
-    )
-    return conversation_chain
-def handle_userinput(user_question):
-    response = sst.conversation({'question': user_question})
-    sst.chat_history = response['chat_history']
-    for i, message in enumerate(sst.chat_history):
-        # Display user message
-        if i % 2 == 0:
-            st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
-        else:
-            print(message)
-            # Display AI response
-            st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
-            # Display source document information if available in the message
-            if hasattr(message, 'source') and message.source:
-                st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
-if True:
-    BASE_URL = "https://api.vectara.io/v1"
-    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
-    OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
-    PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
-    HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
-    VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
-    VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]
-    headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}
-def main():
-    st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
-    st.write(css, unsafe_allow_html=True)
-    if "conversation" not in sst:
-        sst.conversation = None
-    if "chat_history" not in sst:
-        sst.chat_history = None
-    if "page" not in sst:
-        sst.page = "home"
-    if "openai" not in sst:
-        sst.openai = True
-    if "login" not in sst:
-        sst.login = False
-    if 'submitted_user_query' not in sst:
-        sst.submitted_user_query = ''
-    if 'submitted_user_safe' not in sst:
-        sst.submitted_user_safe = ''
-    if 'submitted_user_load' not in sst:
-        sst.submitted_user_load = ''
-    def submit_user_query():
-        sst.submitted_user_query = sst.widget_user_query
-        sst.widget_user_query = ''
-    def submit_user_safe():
-        sst.submitted_user_safe = sst.widget_user_safe
-        sst.widget_user_safe = ''
-        if "vectorstore" in sst:
-            # faiss_name = str(datetime.now().strftime("%Y%m%d%H%M%S")) + "faiss_index"
-            faiss_utils.save_local(sst.vectorstore, path=sst.submitted_user_safe)
-            st.sidebar.success("saved")
-        else:
-            st.sidebar.warning("No embeddings to save. Please process documents first.")
-    def submit_user_load():
-        sst.submitted_user_load = sst.widget_user_load
-        sst.widget_user_load = ''
-        if os.path.exists(sst.submitted_user_load):
-            new_db = faiss_utils.load_vectorstore(f"{sst.submitted_user_load}/faiss_index.index")
-            if "vectorstore" in sst:
-                if new_db is not None:  # Check if this is working
-                    sst.vectorstore.merge_from(new_db)
-                    sst.conversation = get_conversation_chain(sst.vectorstore)
-                    st.sidebar.success("faiss loaded")
-            else:
-                if new_db is not None:  # Check if this is working
-                    sst.vectorstore = new_db
-                    sst.conversation = get_conversation_chain(new_db)
-                    st.sidebar.success("faiss loaded")
-        else:
-            st.sidebar.warning("Couldn't load/find embeddings")
-    st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
-    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
-        #user_question = st.text_input("Ask a question about your documents:", key="user_query", on_change=handle_query)
-        st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)
-        #sst.openai = st.toggle(label="use openai?")
-        if sst.submitted_user_query:
-            if "vectorstore" in sst:
-                handle_userinput(sst.submitted_user_query)
-            else:
-                st.warning("no vectorstore loaded.")
-        with st.sidebar:
-            st.subheader("Your documents")
-            pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
-            if st.button("Process"):
-                with st.spinner("Processing"):
-                    vec = get_faiss_vectorstore(get_text_chunks(get_pdf_text(pdf_docs)))
-                    sst.vectorstore = vec
-                    sst.conversation = get_conversation_chain(vec)
-                st.success("embedding complete")
-            st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
-                          on_change=submit_user_safe)
-            st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
-                          on_change=submit_user_load)
-if __name__ == '__main__':
-    sst = st.session_state
-    ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
-    main()

+"""
+complete, functional RAG App
+stores vectors in session state, or locally.
+add function to display retrieved documents
+"""
+# import time
+from datetime import datetime
+# import openai
+# import tiktoken
+import streamlit as st
+from PyPDF2 import PdfReader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.chat_models import ChatOpenAI
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationalRetrievalChain
+from html_templates import css, bot_template, user_template
+from langchain.llms import HuggingFaceHub
+import os
+import numpy as np
+import faiss_utils
+from langchain_community.vectorstores import FAISS
+from langchain.embeddings import OpenAIEmbeddings
+def merge_faiss_indices(index1, index2):
+    """
+    Merge two FAISS indices into a new index, assuming both are of the same type and dimensionality.
+    Args:
+    index1 (faiss.Index): The first FAISS index.
+    index2 (faiss.Index): The second FAISS index.
+    Returns:
+    faiss.Index: A new FAISS index containing all vectors from index1 and index2.
+    """
+    # NOTE(review): the only `FAISS` name imported at the top of this file is the
+    # LangChain vector-store class, not the `faiss` module, so `FAISS.IndexFlatL2`,
+    # `FAISS.rev_swig_ptr` and `FAISS.METRIC_L2` below presumably raise
+    # AttributeError -- confirm, and bind the real `faiss` module if this helper is used.
+    # Check if both indices are the same type
+    if type(index1) != type(index2):
+        raise ValueError("Indices are of different types")
+    # Check dimensionality
+    if index1.d != index2.d:
+        raise ValueError("Indices have different dimensionality")
+    # Determine type of indices
+    if isinstance(index1, FAISS.IndexFlatL2):
+        # Handle simple flat indices
+        d = index1.d
+        # Extract vectors from both indices
+        # NOTE(review): rev_swig_ptr is asked for ntotal * d elements, so xb1/xb2 look
+        # like flat 1-D arrays; np.vstack would then require equal lengths -- confirm
+        # that a reshape to (ntotal, d) is not missing here.
+        xb1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
+        xb2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)
+        # Combine vectors
+        xb_combined = np.vstack((xb1, xb2))
+        # Create a new index and add combined vectors
+        new_index = FAISS.IndexFlatL2(d)
+        new_index.add(xb_combined)
+        return new_index
+    elif isinstance(index1, FAISS.IndexIVFFlat):
+        # Handle quantized indices (IndexIVFFlat)
+        d = index1.d
+        nlist = index1.nlist
+        quantizer = FAISS.IndexFlatL2(d)  # Re-create the appropriate quantizer
+        # Create a new index with the same configuration
+        new_index = FAISS.IndexIVFFlat(quantizer, d, nlist, FAISS.METRIC_L2)
+        # If the indices are already trained, you can directly add the vectors
+        # Otherwise, you may need to train new_index using a representative subset of vectors
+        # NOTE(review): new_index is created fresh in this branch and never trained
+        # before add() is called -- confirm whether a train() step is required.
+        vecs1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
+        vecs2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)
+        new_index.add(vecs1)
+        new_index.add(vecs2)
+        return new_index
+    else:
+        raise TypeError("Index type not supported for merging in this function")
+def get_pdf_text(pdf_docs):
+    """
+    Concatenate the extracted text of every page of every given PDF.
+    Args:
+    pdf_docs: Iterable of PDF sources that PdfReader accepts (e.g. uploaded files).
+    Returns:
+    str: The text of all pages in document order; "" when pdf_docs is empty.
+    """
+    page_texts = []
+    for pdf in pdf_docs:
+        for page in PdfReader(pdf).pages:
+            # A page without extractable text must not break the concatenation,
+            # so a missing result is treated as empty text.
+            page_texts.append(page.extract_text() or "")
+    # Join once instead of growing a string page by page.
+    return "".join(page_texts)
+def get_text_chunks(text):
+    """Split text on newlines into chunks of up to 1000 characters with 200 characters of overlap."""
+    splitter_options = {
+        "separator": "\n",
+        "chunk_size": 1000,
+        "chunk_overlap": 200,
+        "length_function": len,
+    }
+    return CharacterTextSplitter(**splitter_options).split_text(text)
+def get_faiss_vectorstore(text_chunks):
+    """
+    Embed text_chunks and return the FAISS vector store built from them.
+    Uses OpenAI embeddings when sst.openai is truthy, otherwise the
+    "hkunlp/instructor-xl" instructor embeddings.
+    """
+    my_embeddings = (
+        OpenAIEmbeddings()
+        if sst.openai
+        else HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+    )
+    return FAISS.from_texts(texts=text_chunks, embedding=my_embeddings)
+def get_conversation_chain(vectorstore):
+    """
+    Build a ConversationalRetrievalChain that retrieves from vectorstore.
+    The LLM is ChatOpenAI when sst.openai is truthy, otherwise the
+    "google/flan-t5-xxl" model from the Hugging Face Hub; chat history is kept
+    in a ConversationBufferMemory under the key 'chat_history'.
+    """
+    if not sst.openai:
+        llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})
+    else:
+        llm = ChatOpenAI()
+    chat_memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
+    return ConversationalRetrievalChain.from_llm(
+        llm=llm,
+        retriever=vectorstore.as_retriever(),
+        memory=chat_memory,
+    )
+def handle_userinput(user_question):
+    """Send user_question through the stored conversation chain and render the full chat history."""
+    sst.chat_history = sst.conversation({'question': user_question})['chat_history']
+    for position, message in enumerate(sst.chat_history):
+        # Even positions are rendered with the user template, odd ones with the bot template.
+        if position % 2 == 0:
+            st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
+            continue
+        print(message)
+        st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
+        # Show the source document only when the message object carries one.
+        source = getattr(message, 'source', None)
+        if source:
+            st.write(f"Source Document: {source}", unsafe_allow_html=True)
+if True:
+    # Service configuration is read when the module is imported; each os.environ[...]
+    # lookup raises KeyError if the variable is not set.
+    BASE_URL = "https://api.vectara.io/v1"
+    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
+    OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
+    PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
+    HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
+    VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
+    VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]
+    # Bearer-token headers built from the Vectara API key.
+    headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}
+def main():
+    """
+    Render the password-protected Streamlit chat UI for asking questions about uploaded PDFs.
+    Seeds the session-state defaults, registers the text-input callbacks for asking,
+    saving and loading embeddings, and builds the sidebar used to embed new documents.
+    Relies on the module globals `sst` and `ASK_ASH_PASSWORD`.
+    """
+    st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
+    st.write(css, unsafe_allow_html=True)
+    # Seed each session key only when it is absent so reruns keep the user's state.
+    session_defaults = {
+        "conversation": None,
+        "chat_history": None,
+        "page": "home",
+        "openai": True,
+        "login": False,
+        "submitted_user_query": '',
+        "submitted_user_safe": '',
+        "submitted_user_load": '',
+    }
+    for key, default in session_defaults.items():
+        if key not in sst:
+            sst[key] = default
+    def submit_user_query():
+        # Move the widget value into its own key and clear the widget afterwards.
+        sst.submitted_user_query = sst.widget_user_query
+        sst.widget_user_query = ''
+    def submit_user_safe():
+        sst.submitted_user_safe = sst.widget_user_safe
+        sst.widget_user_safe = ''
+        if "vectorstore" in sst:
+            # faiss_name = str(datetime.now().strftime("%Y%m%d%H%M%S")) + "faiss_index"
+            faiss_utils.save_local(sst.vectorstore, path=sst.submitted_user_safe)
+            st.sidebar.success("saved")
+        else:
+            st.sidebar.warning("No embeddings to save. Please process documents first.")
+    def submit_user_load():
+        sst.submitted_user_load = sst.widget_user_load
+        sst.widget_user_load = ''
+        if os.path.exists(sst.submitted_user_load):
+            new_db = faiss_utils.load_vectorstore(f"{sst.submitted_user_load}/faiss_index.index")
+            if "vectorstore" in sst:
+                if new_db is not None:  # Check if this is working
+                    # Extend the embeddings already in the session with the loaded ones.
+                    sst.vectorstore.merge_from(new_db)
+                    sst.conversation = get_conversation_chain(sst.vectorstore)
+                    st.sidebar.success("faiss loaded")
+            else:
+                if new_db is not None:  # Check if this is working
+                    sst.vectorstore = new_db
+                    sst.conversation = get_conversation_chain(new_db)
+                    st.sidebar.success("faiss loaded")
+        else:
+            st.sidebar.warning("Couldn't load/find embeddings")
+    st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
+    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
+        #user_question = st.text_input("Ask a question about your documents:", key="user_query", on_change=handle_query)
+        st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)
+        #sst.openai = st.toggle(label="use openai?")
+        if sst.submitted_user_query:
+            if "vectorstore" in sst:
+                handle_userinput(sst.submitted_user_query)
+            else:
+                st.warning("no vectorstore loaded.")
+        with st.sidebar:
+            st.subheader("Your documents")
+            pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
+            if st.button("Process"):
+                if not pdf_docs:
+                    # Nothing uploaded: building a vector store from zero texts would fail.
+                    st.warning("no documents uploaded.")
+                else:
+                    with st.spinner("Processing"):
+                        chunks = get_text_chunks(get_pdf_text(pdf_docs))
+                        if chunks:
+                            vec = get_faiss_vectorstore(chunks)
+                            sst.vectorstore = vec
+                            sst.conversation = get_conversation_chain(vec)
+                    if chunks:
+                        st.success("embedding complete")
+                    else:
+                        # PDFs without extractable text yield no chunks to embed.
+                        st.warning("no text could be extracted from the uploaded documents.")
+            st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
+                          on_change=submit_user_safe)
+            st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
+                          on_change=submit_user_load)
+if __name__ == '__main__':
+    # `sst` and `ASK_ASH_PASSWORD` are module globals read by the functions above;
+    # they are only bound when this file runs as the entry script.
+    sst = st.session_state
+    ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
+    main()

classify_app.py → util/classify_app.py RENAMED Viewed

@@ -1,197 +1,197 @@
-import streamlit as st
-import os
-# import openai
-from PyPDF2 import PdfReader
-from openai import OpenAI
-from langchain.chat_models import ChatOpenAI
-ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
-def gpt4_new(prompt_text):
-    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
-    response = client.chat.completions.create(
-        model="gpt-4",
-        messages=[{"role": "system",
-                   "content":   "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten,"
-                                "das Dokument in vorgegebene Kategorien klassifiziert."
-                                "Du gibts möglichst kurze Antworten, am besten ein Wort"
-                                "Du gibst keine Erklärungen oder Begründungen. "
-                                "Du klassifizierst nur nach den vorgegebenen Kategorien."
-                                "Wenn ein Dokument partout nicht klassifizierbar ist, "
-                                "antwortest du mit '<no classification>'"},
-                  {"role": "user", "content": prompt_text}])
-    return response.choices[0].message.content
-# Define a function to ask a question to GPT-4
-def ask_gpt4(question):
-    print(question)  # we don't have to submit the question?
-    try:
-        # Use the chat function to send a message and get a response
-        response = ChatOpenAI()
-        # Extract the response text
-        return response["choices"][0]["message"]["content"]
-    except Exception as e:
-        # Handle exceptions that may occur during the API call
-        return str(e)
-def process_prompts_and_save(my_prompts):
-    # Ensure the responses list is empty initially
-    responses = []
-    # Loop through each prompt in the list
-    for prompt in my_prompts:
-        try:
-            # ADD LOGIC TO READ FILE AND CLASSIFY
-            # Generate response for each prompt and append to the list
-            response = ask_gpt4(prompt)
-            sol = f"{prompt}\n\n{response}\n\n\n\n"
-            print(sol)
-            responses.append(sol)
-        except Exception as e:
-            # In case of an error, log the error with the prompt
-            responses.append(f"{prompt}\n\nError:{str(e)}\n\n\n\n")
-    # Writing all responses to a text file
-    with open('gpt4_responses.txt', 'w', encoding='utf-8') as file:
-        file.writelines(responses)
-def get_pdfs_text(pdf_docs):
-    text = ""
-    for pdf in pdf_docs:
-        pdf_reader = PdfReader(pdf)
-        for page in pdf_reader.pages:
-            text += page.extract_text()
-    return text
-def get_pdf_text(pdf_document):
-    text = ""
-    pdf_reader = PdfReader(pdf_document)
-    for page in pdf_reader.pages:
-        text += page.extract_text()
-    return text
-def json_open(filename):
-    with open(filename, "r") as f:
-        mydata = f.read()
-    return mydata
-def main():
-    st.title("Doc Classifier")
-    l, r = st.columns(2)
-    if st.toggle("show README"):
-        st.subheader("Funktion: ")
-        st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente")
-        st.write("des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren.")
-        st.write("lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren.")
-        st.write("Feedback und Bugs gerne an elia.waefler@insel.ch")
-        st.write("Vielen Dank.")
-        st.write("")
-        with l:
-            st.subheader("Limitationen: ")
-            st.write("bisher nur PDFs")
-            st.write("nur Disziplin, Doc typ. und Geschoss")
-            st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
-            st.write("")
-        with r:
-            st.subheader("geplante Erweiterungen:")
-            st.write("Text Beschreibung wird von AI hinzugefügt")
-            st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
-            st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")
-    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
-        uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)
-        #print(uploaded_file)
-        #print(uploaded_file.name)
-        if st.button("classify KBOB!"):
-            if uploaded_files is not None:
-                with st.container():
-                    # col1, col2, col3, col4, col5 = st.columns(5)
-                    col1, col2, col3 = st.columns(3)
-                    all_metadata = []
-                    with col1:
-                        st.write("Disziplin")
-                        st.write(f"")
-                    with col2:
-                        st.write("Dokumententyp")
-                        st.write(f"")
-                    with col3:
-                        st.write("Geschoss")
-                        st.write(f"")
-                    for file in uploaded_files:
-                        metadata = [file.name]
-                        with col1:
-                            with st.spinner("GPT4 at work"):
-                                pdf_text = str(get_pdf_text(file))
-                                prompt_1 = auftrag_0 + auftrag_1_disziplin + str(Baubranchen_Disziplinen) + pdf_text
-                                answer_1 = gpt4_new(prompt_1)
-                                print(prompt_1)
-                                metadata.append(answer_1)
-                            st.write(answer_1)
-                        with col2:
-                            with st.spinner("GPT4 at work"):
-                                prompt_2 = auftrag_0 + auftrag_1_type + str(Dokumententypen) + pdf_text
-                                answer_2 = gpt4_new(prompt_2)
-                                print(prompt_2)
-                                metadata.append(answer_2)
-                            st.write(answer_2)
-                        with col3:
-                            with st.spinner("GPT4 at work"):
-                                prompt_3 = auftrag_0 + auftrag_1_ge + str(ASH_Geschosse) + pdf_text
-                                answer_3 = gpt4_new(prompt_3)
-                                print(prompt_3)
-                                metadata.append(answer_2)
-                            st.write(answer_3)
-                        all_metadata.append(metadata)
-                    metadata_filename = "ai_generated_metadata.txt"
-                    with open(metadata_filename, 'w', encoding='utf-8') as f:
-                        for line in all_metadata:
-                            f.writelines("\n")
-                            for item in line:
-                                f.writelines(item)
-                                f.writelines(";")
-                            f.writelines("\n")
-                    st.success("classified, saved")
-                    st.download_button(f"Download Metadata", json_open(metadata_filename), file_name=metadata_filename)
-            else:
-                st.warning("no file")
-if __name__ == "__main__":
-    #prompts = ["classify the document, tell me the ", "hello"]
-    #process_prompts_and_save(prompts)
-    auftrag_0 = "Klassifiziere dieses Dokument nach "
-    auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
-    auftrag_1_type = "diesen 'Dokumententypen': "
-    auftrag_1_ge = "diesen 'Geschossen': "
-    Baubranchen_Disziplinen = ['A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
-                               'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
-                               'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
-                               'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
-                               'Z-Lichtplanung']
-    auftrag_2 = "gib nur den am besten passendsten Eintrag zurück. " \
-                "Keine weiteren Ausführungen oder Erklärungen. " \
-                "Antworte am besten in einem Wort. " \
-                "Hier der Dokumenteninhalt: "
-    Dokumententypen = ['Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
-                       'Prozessdokumentation',  'Fachdokumentation', 'Anlagedokumentation']
-    ASH_Geschosse = ['U4', 'U3', 'U2', 'U1',
-                     'A', 'B', 'C', 'D', 'E', 'F', 'G']
-    #print(str(Baubranchen_Disziplinen))
-    main()

+import streamlit as st
+import os
+# import openai
+from PyPDF2 import PdfReader
+from openai import OpenAI
+from langchain.chat_models import ChatOpenAI
+ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
+def gpt4_new(prompt_text):
+    """
+    Send prompt_text to the OpenAI chat-completions API (model "gpt-4") and return the reply.
+    Args:
+    prompt_text (str): The user message, i.e. classification task plus document text.
+    Returns:
+    The content of the first returned choice.
+    """
+    # Adjacent string literals are concatenated without any separator, so every
+    # sentence carries its own trailing space; otherwise the sentences run together.
+    system_prompt = (
+        "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten, "
+        "das Dokument in vorgegebene Kategorien klassifiziert. "
+        "Du gibst möglichst kurze Antworten, am besten ein Wort. "
+        "Du gibst keine Erklärungen oder Begründungen. "
+        "Du klassifizierst nur nach den vorgegebenen Kategorien. "
+        "Wenn ein Dokument partout nicht klassifizierbar ist, "
+        "antwortest du mit '<no classification>'"
+    )
+    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
+    response = client.chat.completions.create(
+        model="gpt-4",
+        messages=[{"role": "system", "content": system_prompt},
+                  {"role": "user", "content": prompt_text}])
+    return response.choices[0].message.content
+# Define a function to ask a question to GPT-4
+def ask_gpt4(question):
+    """
+    Submit question to the default ChatOpenAI model and return the reply text.
+    Args:
+    question (str): Prompt to send.
+    Returns:
+    str: The model's answer, or the error message if the call fails.
+    """
+    print(question)
+    try:
+        # Previously the client was created but the question was never submitted, and
+        # the client object itself was subscripted, so only an error string came back.
+        reply = ChatOpenAI().invoke(question)
+        return reply.content
+    except Exception as e:
+        # Keep the existing contract: failures are returned as text, not raised.
+        return str(e)
+def process_prompts_and_save(my_prompts):
+    """Ask GPT-4 every prompt in turn and write all prompt/answer pairs to gpt4_responses.txt."""
+    collected = []
+    for prompt in my_prompts:
+        try:
+            # ADD LOGIC TO READ FILE AND CLASSIFY
+            answer = ask_gpt4(prompt)
+            entry = f"{prompt}\n\n{answer}\n\n\n\n"
+            print(entry)
+        except Exception as e:
+            # Record the failure next to its prompt instead of aborting the batch.
+            entry = f"{prompt}\n\nError:{str(e)}\n\n\n\n"
+        collected.append(entry)
+    # Persist everything in one go once all prompts have been handled.
+    with open('gpt4_responses.txt', 'w', encoding='utf-8') as file:
+        file.writelines(collected)
+def get_pdfs_text(pdf_docs):
+    """
+    Concatenate the extracted text of every page of several PDFs.
+    Args:
+    pdf_docs: Iterable of PDF sources that PdfReader accepts.
+    Returns:
+    str: The text of all pages in document order; "" when pdf_docs is empty.
+    """
+    page_texts = []
+    for pdf in pdf_docs:
+        for page in PdfReader(pdf).pages:
+            # A page without extractable text contributes nothing instead of failing.
+            page_texts.append(page.extract_text() or "")
+    return "".join(page_texts)
+def get_pdf_text(pdf_document):
+    """
+    Return the extracted text of all pages of a single PDF.
+    Args:
+    pdf_document: A PDF source that PdfReader accepts (e.g. an uploaded file).
+    Returns:
+    str: The page texts concatenated in page order.
+    """
+    # A page without extractable text contributes nothing instead of failing.
+    return "".join(page.extract_text() or "" for page in PdfReader(pdf_document).pages)
+def json_open(filename):
+    """Return the whole content of filename as text (despite the name, nothing is parsed as JSON)."""
+    # main() writes the metadata file as UTF-8, so read it back with the same
+    # encoding instead of the platform default.
+    with open(filename, "r", encoding="utf-8") as f:
+        return f.read()
+def main():
+    """
+    Render the Streamlit document classifier.
+    After the password check, every uploaded PDF is classified by GPT-4 by
+    discipline, document type and storey; the answers are shown in three columns
+    and written to ai_generated_metadata.txt, which is offered for download.
+    Relies on the module globals defined in the `__main__` block (auftrag_*,
+    Baubranchen_Disziplinen, Dokumententypen, ASH_Geschosse).
+    """
+    st.title("Doc Classifier")
+    l, r = st.columns(2)
+    if st.toggle("show README"):
+        st.subheader("Funktion: ")
+        st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente")
+        st.write("des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren.")
+        st.write("lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren.")
+        st.write("Feedback und Bugs gerne an elia.waefler@insel.ch")
+        st.write("Vielen Dank.")
+        st.write("")
+        with l:
+            st.subheader("Limitationen: ")
+            st.write("bisher nur PDFs")
+            st.write("nur Disziplin, Doc typ. und Geschoss")
+            st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
+            st.write("")
+        with r:
+            st.subheader("geplante Erweiterungen:")
+            st.write("Text Beschreibung wird von AI hinzugefügt")
+            st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
+            st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")
+    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
+        uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)
+        if st.button("classify KBOB!"):
+            # With accept_multiple_files=True the uploader yields a (possibly empty)
+            # list, so test for emptiness; an `is not None` check never reaches "no file".
+            if uploaded_files:
+                with st.container():
+                    col1, col2, col3 = st.columns(3)
+                    all_metadata = []
+                    for column, title in ((col1, "Disziplin"), (col2, "Dokumententyp"), (col3, "Geschoss")):
+                        with column:
+                            st.write(title)
+                            st.write("")
+                    # One entry per output column: where to render, which task text and
+                    # which category list to send. Looping over this table keeps every
+                    # answer in its own column (the third column used to store answer_2).
+                    classifications = (
+                        (col1, auftrag_1_disziplin, Baubranchen_Disziplinen),
+                        (col2, auftrag_1_type, Dokumententypen),
+                        (col3, auftrag_1_ge, ASH_Geschosse),
+                    )
+                    for file in uploaded_files:
+                        metadata = [file.name]
+                        pdf_text = str(get_pdf_text(file))
+                        for column, auftrag, categories in classifications:
+                            with column:
+                                with st.spinner("GPT4 at work"):
+                                    # NOTE(review): the global auftrag_2 ("Hier der
+                                    # Dokumenteninhalt: ") is never part of the prompt --
+                                    # confirm whether it should precede pdf_text.
+                                    prompt = auftrag_0 + auftrag + str(categories) + pdf_text
+                                    answer = gpt4_new(prompt)
+                                    print(prompt)
+                                    metadata.append(answer)
+                                st.write(answer)
+                        all_metadata.append(metadata)
+                    metadata_filename = "ai_generated_metadata.txt"
+                    with open(metadata_filename, 'w', encoding='utf-8') as f:
+                        for record in all_metadata:
+                            # Same layout as before: blank line, then "item;" per field, then newline.
+                            f.write("\n" + "".join(f"{item};" for item in record) + "\n")
+                    st.success("classified, saved")
+                    st.download_button("Download Metadata", json_open(metadata_filename), file_name=metadata_filename)
+            else:
+                st.warning("no file")
+if __name__ == "__main__":
+    # The names bound below are module globals that main() reads when it builds the
+    # prompts; they only exist when this file runs as the entry script.
+    #prompts = ["classify the document, tell me the ", "hello"]
+    #process_prompts_and_save(prompts)
+    # Prompt fragments: common prefix plus one task text per classification.
+    auftrag_0 = "Klassifiziere dieses Dokument nach "
+    auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
+    auftrag_1_type = "diesen 'Dokumententypen': "
+    auftrag_1_ge = "diesen 'Geschossen': "
+    # Category lists offered to the model.
+    Baubranchen_Disziplinen = ['A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
+                               'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
+                               'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
+                               'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
+                               'Z-Lichtplanung']
+    # NOTE(review): auftrag_2 is defined here but not referenced by main() in this
+    # file -- confirm whether it was meant to be inserted before the document text.
+    auftrag_2 = "gib nur den am besten passendsten Eintrag zurück. " \
+                "Keine weiteren Ausführungen oder Erklärungen. " \
+                "Antworte am besten in einem Wort. " \
+                "Hier der Dokumenteninhalt: "
+    Dokumententypen = ['Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
+                       'Prozessdokumentation',  'Fachdokumentation', 'Anlagedokumentation']
+    ASH_Geschosse = ['U4', 'U3', 'U2', 'U1',
+                     'A', 'B', 'C', 'D', 'E', 'F', 'G']
+    #print(str(Baubranchen_Disziplinen))
+    main()

ingest.py → util/ingest.py RENAMED Viewed

@@ -1,126 +1,126 @@
-from PyPDF2 import PdfReader
-from langchain.text_splitter import CharacterTextSplitter
-import tabula
-import io
-import fitz  # PyMuPDF
-import pdfplumber
-from PIL import Image
-import io
-def get_pdf_tables(pdf_bytes):
-    """
-    Extracts tables from a PDF file loaded directly from bytes.
-    Args:
-    pdf_bytes (bytes): The byte content of the PDF file.
-    Returns:
-    List[pandas.DataFrame]: A list of DataFrames, each representing a table extracted from the PDF.
-    """
-    tables = []
-    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
-        for page in pdf.pages:
-            # Extract tables from the current page
-            page_tables = page.extract_tables()
-            for table in page_tables:
-                # Convert table to a DataFrame and append to the list
-                tables.append(table)
-    # Optionally convert lists of lists (tables) to pandas DataFrames
-    import pandas as pd
-    dataframes = [pd.DataFrame(table[1:], columns=table[0]) for table in tables if table]
-    return dataframes
-def get_pdf_images(pdf_bytes):
-    """
-    Extracts images and captures screenshots of each page from a given PDF's bytes.
-    Args:
-    pdf_bytes (bytes): The byte content of the PDF file.
-    Returns:
-    List[bytes]: A list of image bytes extracted from the PDF, including screenshots of each page.
-    """
-    images = []
-    pdf_stream = io.BytesIO(pdf_bytes)
-    doc = fitz.open("pdf", pdf_stream.read())
-    for page_num, page in enumerate(doc):
-        # Take a screenshot of the current page
-        pix = page.get_pixmap()  # This line captures the page as an image
-        img_bytes = pix.tobytes("png")  # Save the pixmap as PNG bytes
-        images.append(img_bytes)  # Append the screenshot to the list of images
-        # Extract embedded images
-        for img_index, img in enumerate(page.get_images(full=True)):
-            xref = img[0]
-            base_image = doc.extract_image(xref)
-            image_bytes = base_image["image"]
-            images.append(image_bytes)
-    doc.close()
-    return images
-def get_pdf_old_tables(pdf_bytes):
-    """
-    Extracts tables from a given PDF's bytes using Tabula.
-    Args:
-    pdf_bytes (bytes): The byte content of the PDF file.
-    Returns:
-    List[pandas.DataFrame]: A list of DataFrames, each representing a table extracted from the PDF.
-    """
-    pdf_stream = io.BytesIO(pdf_bytes)
-    # Read PDF into list of DataFrame
-    tables = tabula.read_pdf(pdf_stream, pages='all', multiple_tables=True)
-    return tables
-def get_pdf_text(pdf_docs):
-    text = ""
-    if type(pdf_docs) == list:
-        for pdf in pdf_docs:
-            pdf_reader = PdfReader(pdf)
-            for page in pdf_reader.pages:
-                text += page.extract_text()
-    else:
-        pdf_reader = PdfReader(pdf_docs)
-        for page in pdf_reader.pages:
-            text += page.extract_text()
-    return text
-def get_text_chunks(text):
-    text_splitter = CharacterTextSplitter(
-        separator="\n",
-        chunk_size=1000,
-        chunk_overlap=200,
-        length_function=len
-    )
-    chunks = text_splitter.split_text(text)
-    return chunks
-def extract_images_from_pdf_path(pdf_path):
-    doc = fitz.open(pdf_path)
-    images = []
-    for i in range(len(doc)):
-        for img in doc.get_page_images(i):
-            xref = img[0]
-            base = img[1]
-            img_data = doc.extract_image(xref)
-            img_bytes = img_data['image']
-            image = Image.open(io.BytesIO(img_bytes))
-            images.append(image)
-    return images
-def get_tables_from_pdf_path(pdf_path):
-    # read_pdf will save the pdf table into Pandas Dataframe
-    tables = tabula.read_pdf(pdf_path, pages='all')
-    return tables

+from PyPDF2 import PdfReader
+from langchain.text_splitter import CharacterTextSplitter
+import tabula
+import io
+import fitz  # PyMuPDF
+import pdfplumber
+from PIL import Image
+import io
+def get_pdf_tables(pdf_bytes):
+    """
+    Collects every table pdfplumber finds in an in-memory PDF.
+    Args:
+    pdf_bytes (bytes): Raw content of the PDF file.
+    Returns:
+    List[pandas.DataFrame]: One DataFrame per non-empty table, in page order. The first row
+    of each table supplies the column labels, the remaining rows the data.
+    """
+    raw_tables = []
+    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
+        for page in pdf.pages:
+            # gather the page's tables as they come; conversion happens afterwards
+            raw_tables.extend(page.extract_tables())
+    # pandas is only needed for the final conversion step
+    import pandas as pd
+    frames = []
+    for rows in raw_tables:
+        if not rows:
+            continue
+        header, *body = rows
+        frames.append(pd.DataFrame(body, columns=header))
+    return frames
+def get_pdf_images(pdf_bytes):
+    """
+    Renders every page of a PDF as PNG and collects the images embedded in it.
+    Args:
+    pdf_bytes (bytes): The byte content of the PDF file.
+    Returns:
+    List[bytes]: For each page in order, the PNG rendering of the page followed by the raw
+    bytes of the images that page references.
+    """
+    images = []
+    doc = fitz.open("pdf", io.BytesIO(pdf_bytes).read())
+    try:
+        for page in doc:
+            # page rendering first, so it precedes the page's embedded images in the result
+            images.append(page.get_pixmap().tobytes("png"))
+            for img in page.get_images(full=True):
+                # the first field of an image entry is the xref used to extract it
+                images.append(doc.extract_image(img[0])["image"])
+    finally:
+        # release the document even when rendering or extraction fails
+        doc.close()
+    return images
+def get_pdf_old_tables(pdf_bytes):
+    """
+    Table extraction with Tabula for a PDF held in memory.
+    Args:
+    pdf_bytes (bytes): Raw content of the PDF file.
+    Returns:
+    The result of tabula.read_pdf for all pages with multiple_tables enabled
+    (documented upstream as a list of pandas DataFrames).
+    """
+    # Tabula gets a file-like view of the bytes instead of a path
+    return tabula.read_pdf(io.BytesIO(pdf_bytes), pages='all', multiple_tables=True)
+def get_pdf_text(pdf_docs):
+    """
+    Concatenates the extracted text of all pages of one or several PDFs.
+    Args:
+    pdf_docs: A single source accepted by PdfReader, or a list of such sources.
+    Returns:
+    str: The page texts joined without separator, in document and page order.
+    """
+    # a lone document is handled like a one-element list
+    sources = pdf_docs if type(pdf_docs) == list else [pdf_docs]
+    text = ""
+    for source in sources:
+        for page in PdfReader(source).pages:
+            text += page.extract_text()
+    return text
+def get_text_chunks(text):
+    """
+    Splits text into overlapping chunks with langchain's CharacterTextSplitter.
+    The splitter is configured with a newline separator, a chunk size of 1000 and an overlap
+    of 200, both measured with len.
+    Args:
+    text (str): The text to split.
+    Returns:
+    The chunks produced by CharacterTextSplitter.split_text.
+    """
+    splitter_options = {
+        "separator": "\n",
+        "chunk_size": 1000,
+        "chunk_overlap": 200,
+        "length_function": len,
+    }
+    return CharacterTextSplitter(**splitter_options).split_text(text)
+def extract_images_from_pdf_path(pdf_path):
+    """
+    Loads every image referenced by the pages of a PDF file as a PIL image.
+    Args:
+    pdf_path: Path to the PDF file.
+    Returns:
+    List[PIL.Image.Image]: The images in page order.
+    """
+    images = []
+    doc = fitz.open(pdf_path)
+    try:
+        for page_index in range(len(doc)):
+            for img in doc.get_page_images(page_index):
+                # img[0] is the xref; the extracted bytes are independent of the document,
+                # so the PIL images stay usable after the document is closed
+                img_bytes = doc.extract_image(img[0])['image']
+                images.append(Image.open(io.BytesIO(img_bytes)))
+    finally:
+        # the original never closed the document
+        doc.close()
+    return images
+def get_tables_from_pdf_path(pdf_path):
+    """Runs Tabula over all pages of the PDF at pdf_path and returns its result."""
+    return tabula.read_pdf(pdf_path, pages='all')

my_1_reader.py → util/my_1_reader.py RENAMED Viewed

@@ -1,201 +1,201 @@
-# MUSS AUFGERÄUMT WERDEN
-import json
-import os
-import subprocess
-import PyPDF2
-import csv
-import fitz  # PyMuPDF
-def extract_text_from_pdf(pdf_path):
-    """
-    Extracts all text from a PDF file.
-    :param pdf_path: Path to the PDF file.
-    :return: Extracted text as a string.
-    """
-    # Open the PDF file
-    doc = fitz.open(pdf_path)
-    # Initialize an empty string to hold the text
-    text = ''
-    # Iterate through each page in the PDF
-    for page_num in range(len(doc)):
-        # Get a page
-        page = doc.load_page(page_num)
-        # Extract text from the page and add it to the result
-        text += page.get_text()
-    # Close the document
-    doc.close()
-    return text
-def read_pdfs_from_folder(folder_path):
-    """
-    Reads all PDF files in the specified folder using PdfReader and extracts their text.
-    Parameters:
-    - folder_path: The path to the folder containing PDF files.
-    Returns:
-    - A dictionary with file names as keys and their extracted text as values.
-    """
-    pdf_texts = {}
-    for filename in os.listdir(folder_path):
-        if filename.endswith('.pdf'):
-            file_path = os.path.join(folder_path, filename)
-            with open(file_path, 'rb') as pdf_file:
-                pdf_reader = PyPDF2.PdfReader(pdf_file)
-                text = ''
-                for page in pdf_reader.pages:
-                    try:
-                        text += page.extract_text()
-                    except UnicodeDecodeError as e:
-                        print(e)
-                for c in text:
-                    if c in ["ä", "Ä"]:
-                        text = text[:text.index(c)] + "ae" + text[text.index(c)+1:]
-                    if c in ["ö", "Ö"]:
-                        text = text[:text.index(c)] + "oe" + text[text.index(c)+1:]
-                    if c in ["ü", "Ü"]:
-                        text = text[:text.index(c)] + "ue" + text[text.index(c)+1:]
-                    if c in [",", ";", "\\", '"']:
-                        text = text[:text.index(c)] + "_" + text[text.index(c)+1:]
-                    if c in ["/n", "\n"]:
-                        text = text[:text.index(c)] + "<newline>" + text[text.index(c) + 1:]
-                pdf_texts[filename] = text
-    return pdf_texts
-def read_csv_lines_as_strings(filename):
-    """
-    Opens a CSV file and returns each line as a string in a list.
-    Parameters:
-    - filename: The path to the CSV file.
-    Returns:
-    - A list of strings, each representing a line from the CSV file.
-    """
-    lines_as_strings = []
-    with open(filename, newline='') as csvfile:
-        try:
-            reader = csv.reader(csvfile)
-            for row in reader:
-                # Convert the row (a list of values) back into a comma-separated string
-                line_as_string = ','.join(row)
-                lines_as_strings.append(line_as_string)
-        except UnicodeDecodeError as e:
-            print(e)
-    return lines_as_strings
-# Function to load data from JSON files
-def load_data(filename):
-    with open(filename, 'r') as file:
-        try:
-            return json.load(file)
-        except UnicodeDecodeError as err:
-            print(err)
-            return {}
-def find_and_open_file(filename, start_directory):
-    """
-    Attempts to open a file with the given filename starting from the specified directory.
-    If the file is not found, searches recursively in all subfolders. Works across macOS, Linux, and Windows.
-    """
-    for root, dirs, files in os.walk(start_directory):
-        if filename in files:
-            filepath = os.path.join(root, filename)
-            print(f"File found: {filepath}")
-            return filepath
-    print(f"File {filename} not found.")
-    return None
-def open_file(filepath):
-    """
-    Opens the file with the default application, based on the operating system.
-    """
-    if os.path.exists(filepath):
-        if os.name == 'posix':  # Linux, macOS, etc.
-            subprocess.call(('open', filepath))
-        elif os.name == 'nt':  # Windows
-            os.startfile(filepath)
-        else:
-            print(f"Cannot open file on this operating system: {filepath}")
-    else:
-        print(f"File does not exist: {filepath}")
-def list_folders_files_recursive(path, depth=0):
-    """
-    Recursively lists all folders and files within the specified path, including subfolders.
-    Parameters:
-    - path: The directory path to list contents from.
-    - depth: The current depth of recursion (used for indentation in print statements).
-    Returns:
-    - None
-    """
-    # Ensure the provided path is a directory
-    if not os.path.isdir(path):
-        print(f"The provided path '{path}' is not a valid directory.")
-        return
-    indent = '  ' * depth  # Indentation based on recursion depth
-    folders, files = [], []
-    # List all entries in the directory
-    for entry in os.listdir(path):
-        full_path = os.path.join(path, entry)
-        if os.path.isdir(full_path):
-            folders.append(entry)
-            print(f"{indent}Folder: {entry}")
-            # Recursively list subfolders and files
-            list_folders_files_recursive(full_path, depth + 1)
-        elif os.path.isfile(full_path):
-            files.append(entry)
-    for f in files:
-        print(f"{indent}File: {f}")
-def list_folders_files(path):
-    """
-    Lists all folders and files within the specified path.
-    Parameters:
-    - path: The directory path to list contents from.
-    Returns:
-    - A tuple of two lists: (folders, files).
-    """
-    folders = []
-    files = []
-    # Ensure the provided path is a directory
-    if not os.path.isdir(path):
-        print(f"The provided path '{path}' is not a valid directory.")
-        return folders, files
-    # List all entries in the directory
-    for entry in os.listdir(path):
-        full_path = os.path.join(path, entry)
-        if os.path.isdir(full_path):
-            folders.append(entry)
-        elif os.path.isfile(full_path):
-            files.append(entry)
-    return folders, files
-if __name__ == "__main__":
-    print("here are all functions that read files")

+# TODO: this module needs to be cleaned up
+import json
+import os
+import subprocess
+import PyPDF2
+import csv
+import fitz  # PyMuPDF
+def extract_text_from_pdf(pdf_path):
+    """
+    Extracts all text from a PDF file.
+    :param pdf_path: Path to the PDF file.
+    :return: The text of all pages, concatenated in page order.
+    """
+    doc = fitz.open(pdf_path)
+    try:
+        # build the result in one join instead of growing a string page by page
+        return ''.join(doc.load_page(page_num).get_text() for page_num in range(len(doc)))
+    finally:
+        # close the document even if a page fails to load or extract
+        doc.close()
+# Single-character substitutions applied to extracted PDF text: umlauts are transliterated
+# (upper- and lower-case both map to the lower-case digraph), separator and quote characters
+# become "_" and line breaks become a visible "<newline>" token.
+_PDF_TEXT_REPLACEMENTS = str.maketrans({
+    "ä": "ae", "Ä": "ae",
+    "ö": "oe", "Ö": "oe",
+    "ü": "ue", "Ü": "ue",
+    ",": "_", ";": "_", "\\": "_", '"': "_",
+    "\n": "<newline>",
+})
+def read_pdfs_from_folder(folder_path):
+    """
+    Reads all PDF files in the specified folder using PdfReader and extracts their text.
+    The extracted text is sanitized with _PDF_TEXT_REPLACEMENTS in a single pass.
+    Parameters:
+    - folder_path: The path to the folder containing PDF files.
+    Returns:
+    - A dictionary with file names as keys and their sanitized text as values. Only names
+      ending in '.pdf' are read.
+    """
+    pdf_texts = {}
+    for filename in os.listdir(folder_path):
+        if not filename.endswith('.pdf'):
+            continue
+        file_path = os.path.join(folder_path, filename)
+        with open(file_path, 'rb') as pdf_file:
+            pdf_reader = PyPDF2.PdfReader(pdf_file)
+            text = ''
+            for page in pdf_reader.pages:
+                try:
+                    text += page.extract_text()
+                except UnicodeDecodeError as e:
+                    # best effort: report the failing page and continue with the next one
+                    print(e)
+        # one translate pass replaces the former per-character index/slice loop
+        pdf_texts[filename] = text.translate(_PDF_TEXT_REPLACEMENTS)
+    return pdf_texts
+def read_csv_lines_as_strings(filename):
+    """
+    Parses a CSV file and re-joins the fields of every row with commas.
+    Parameters:
+    - filename: The path to the CSV file.
+    Returns:
+    - A list with one comma-joined string per parsed row. If decoding fails part-way, the
+      error is printed and the rows read so far are returned.
+    """
+    collected = []
+    with open(filename, newline='') as handle:
+        try:
+            for fields in csv.reader(handle):
+                # quoting is resolved by the csv module, so the joined text carries no quotes
+                collected.append(','.join(fields))
+        except UnicodeDecodeError as exc:
+            print(exc)
+    return collected
+# Load the parsed content of a JSON file
+def load_data(filename):
+    """
+    Reads filename and returns its parsed JSON content.
+    A UnicodeDecodeError raised while reading is printed and an empty dict is returned
+    instead; other errors propagate.
+    """
+    with open(filename, 'r') as handle:
+        try:
+            parsed = json.load(handle)
+        except UnicodeDecodeError as err:
+            print(err)
+            parsed = {}
+    return parsed
+def find_and_open_file(filename, start_directory):
+    """
+    Locates a file by name below start_directory; despite the name, nothing is opened.
+    The directory tree is walked with os.walk and the search stops at the first folder that
+    contains the file. The outcome is also printed.
+    :param filename: Name of the file to look for.
+    :param start_directory: Directory where the search starts.
+    :return: Path of the first match, or None when no folder contains the file.
+    """
+    for folder, _subfolders, names in os.walk(start_directory):
+        if filename not in names:
+            continue
+        match = os.path.join(folder, filename)
+        print(f"File found: {match}")
+        return match
+    print(f"File {filename} not found.")
+    return None
+def open_file(filepath):
+    """
+    Opens the file with the default application, based on the operating system.
+    Windows uses os.startfile, macOS the "open" command and every other POSIX system
+    "xdg-open". A missing file or an unsupported system is reported with a printed message.
+    :param filepath: Path of the file to open.
+    :return: None
+    """
+    import sys  # local import: only needed to tell macOS apart from other POSIX systems
+    if not os.path.exists(filepath):
+        print(f"File does not exist: {filepath}")
+        return
+    if os.name == 'nt':  # Windows
+        os.startfile(filepath)
+    elif os.name == 'posix':  # Linux, macOS, etc.
+        # "open" only exists on macOS; Linux and BSD desktops provide "xdg-open" instead
+        opener = 'open' if sys.platform == 'darwin' else 'xdg-open'
+        subprocess.call((opener, filepath))
+    else:
+        print(f"Cannot open file on this operating system: {filepath}")
+def list_folders_files_recursive(path, depth=0):
+    """
+    Prints the folder tree below path, indented by two spaces per level.
+    Each subfolder is printed and descended into as soon as it is encountered; the files of
+    a folder are printed after all of its entries have been visited.
+    Parameters:
+    - path: The directory path to list contents from.
+    - depth: Current recursion depth, used only for the indentation of the output.
+    Returns:
+    - None
+    """
+    if not os.path.isdir(path):
+        print(f"The provided path '{path}' is not a valid directory.")
+        return
+    indent = '  ' * depth
+    plain_files = []
+    for name in os.listdir(path):
+        child = os.path.join(path, name)
+        if os.path.isdir(child):
+            print(f"{indent}Folder: {name}")
+            list_folders_files_recursive(child, depth + 1)
+            continue
+        if os.path.isfile(child):
+            # files are held back until the whole folder has been scanned
+            plain_files.append(name)
+    for name in plain_files:
+        print(f"{indent}File: {name}")
+def list_folders_files(path):
+    """
+    Splits the direct entries of a directory into folders and files.
+    Parameters:
+    - path: The directory path to list contents from.
+    Returns:
+    - A tuple of two lists of entry names: (folders, files). Both are empty, and a message
+      is printed, when path is not a directory.
+    """
+    if not os.path.isdir(path):
+        print(f"The provided path '{path}' is not a valid directory.")
+        return [], []
+    names = os.listdir(path)
+    folders = [name for name in names if os.path.isdir(os.path.join(path, name))]
+    files = [name for name in names if os.path.isfile(os.path.join(path, name))]
+    return folders, files
+if __name__ == "__main__":
+    # Running the module directly only prints this short description.
+    print("here are all functions that read files")

my_1_writer.py → util/my_1_writer.py RENAMED Viewed

File without changes

my_2_sim_search.py → util/my_2_sim_search.py RENAMED Viewed

@@ -1,163 +1,163 @@
-import my_new_openai
-import my_1_writer
-import json
-import numpy as np
-# sim search with dot_product and lin_distance
-# the newly vectorized TERM will be added to the database
-# database = .json file
-def sim_search_load_db(database, term, add_to_db=True, debug=False):
-    if type(term) == str:
-        print("str")
-        vector1 = my_new_openai.vectorize_data(term)
-    elif type(term) == list:
-        print("list")
-        vector1 = term
-    else:
-        print("invalid search_term/search_vector format")
-        return
-    with open(database, "r") as f:
-        table = json.load(f)
-    sim_search_dict = {}
-    for key in table.keys():
-        vector2 = table[key]
-        if debug:
-            print("")
-            print(f"{vector1}")
-            print(f"{vector2}")
-            print(f"doing dot product for {key} and {term}")
-        dp = np.dot(vector1, vector2)
-        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
-        if debug:
-            print(f"the dp is {dp}")
-            print(f"the distance is{distance}")
-            print("")
-            print("")
-            print("")
-        sim_search_dict[key] = dp * distance
-    # sort with the biggest similarity
-    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1]), reversed=True)
-    if debug:
-        for key, value in sorted_table[:5]:
-            print(f"{key}: {value}")
-    if add_to_db:
-        if term in table.keys():
-            print("the search term is in the database!")
-            # add the newly vectorized term to the words, if not already in the vector table
-        else:
-            if database != "session/my_words_vec_table.json":
-                database = "session/my_vecs.json"
-                # table = load_df(database)  # ??
-            table[str(term)] = vector1
-            my_1_writer.safe_my_dict_as_json(database, table)
-    # first_key, first_value = list(sortedTable.items())[0]
-    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
-    return sorted_table
-def dot_p_to_1(database, vector1=0, analysis_filename=0):
-    with open(database, "r") as f:
-        table = json.load(f)
-    dot_product_to1 = {}
-    if vector1 == 0:
-        vector1 = [0.025515518153991442 for _ in range(1536)]
-    elif vector1 == 1:
-        vector1 = table[str(list(table.keys())[0])]
-    for key in table.keys():
-        dot_product_to1[key] = np.dot(vector1, table[key])
-    my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
-    print("dot p to 1 saved")
-def lin_dist(database, vector1=0, analysis_filename=0):
-    with open(database, "r") as f:
-        table = json.load(f)
-    lin_dist_to_1 = {}
-    if vector1 == 0:
-        vector1 = [0.025515518153991442 for _ in range(1536)]
-    elif vector1 == 1:
-        vector1 = table[str(list(table.keys())[0])]
-    for key in table.keys():
-        lin_dist_to_1[key] = np.linalg.norm(np.array(vector1) - np.array(table[key]))
-    my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
-    print("lin dist to 1 saved")
-def manhattan_dist(database, vector1=0, analysis_filename=0):
-    with open(database, "r") as f:
-        table = json.load(f)
-    manhattan_dist_to_1 = {}
-    if vector1 == 0:
-        vector1 = [0.025515518153991442 for _ in range(1536)]
-    elif vector1 == 1:
-        vector1 = table[str(list(table.keys())[0])]
-    for key in table.keys():
-        manhattan_dist_to_1[key] = sum(np.array(vector1) - np.array(table[key]))
-    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
-    print("manhattan dist to 1 saved")
-#vec_table
-def sim_search_fly(vec_table, term, debug=False):
-    if debug:
-        print(type(vec_table))
-        print(type(term))
-        print(type(vec_table[list(vec_table.keys())[0]]))
-        print("vec table:")
-    print(vec_table[list(vec_table.keys())[5]][:4])
-    print("search term")
-    print(term[:4])
-    if type(term) == str:
-        print("str")
-        vector1 = my_new_openai.vectorize_data(term)
-    elif type(term) == list:
-        print("list")
-        vector1 = term
-    else:
-        print("invalid search_term/search_vector format")
-        return
-    sim_search_dict = {}
-    for key in vec_table.keys():
-        vector2 = vec_table[key]
-        if debug:
-            print("")
-            print(f"{vector1}")
-            print(f"{vector2}")
-            print(f"doing dot product for {key} and {term}")
-        if vector2[0] == vector2[1] and vector2[3] == vector2[4] and vector2[5] == vector2[6]:
-            dp = 200
-        else:
-            dp = np.dot(vector1, vector2)
-        #distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
-        if debug:
-            print(f"the dp is {dp}")
-            #print(f"the distance is{distance}")
-            print("")
-            print("")
-            print("")
-        sim_search_dict[key] = dp #* distance
-    # sort with the biggest similarity
-    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1]), reversed=True)
-    if debug:
-        for key, value in sorted_table[:5]:
-            print(f"{key}: {value}")
-    # first_key, first_value = list(sortedTable.items())[0]
-    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
-    return sorted_table

+import my_new_openai
+import my_1_writer
+import json
+import numpy as np
+# sim search with dot_product and lin_distance
+# the newly vectorized TERM will be added to the database
+# database = .json file
+def sim_search_load_db(database, term, add_to_db=True, debug=False):
+    """
+    Scores every vector stored in a JSON database against a search term.
+    Parameters:
+    - database: Path to a JSON file mapping keys to vectors (lists of numbers).
+    - term: The search text (embedded with my_new_openai.vectorize_data) or an already
+      computed vector given as a list.
+    - add_to_db: When True and str(term) is not yet a key of the table, the term's vector is
+      added and the table is saved with my_1_writer.safe_my_dict_as_json.
+    - debug: Print the compared vectors, the intermediate values and the top five results.
+    Returns:
+    - A dict of key -> score ordered from the highest to the lowest score, or None when
+      term is neither a str nor a list.
+    """
+    if isinstance(term, str):
+        print("str")
+        vector1 = my_new_openai.vectorize_data(term)
+    elif isinstance(term, list):
+        print("list")
+        vector1 = term
+    else:
+        print("invalid search_term/search_vector format")
+        return
+    with open(database, "r") as f:
+        table = json.load(f)
+    query = np.array(vector1)  # converted once instead of once per table entry
+    sim_search_dict = {}
+    for key, vector2 in table.items():
+        if debug:
+            print("")
+            print(f"{vector1}")
+            print(f"{vector2}")
+            print(f"doing dot product for {key} and {term}")
+        dp = np.dot(vector1, vector2)
+        distance = np.linalg.norm(query - np.array(vector2))
+        if debug:
+            print(f"the dp is {dp}")
+            print(f"the distance is{distance}")
+            print("")
+            print("")
+            print("")
+        # NOTE(review): the score multiplies a similarity (dot product) by a distance, so a
+        # larger value does not obviously mean "more similar" — confirm the intended metric.
+        sim_search_dict[key] = dp * distance
+    # sort with the biggest score first; "reverse" is an argument of sorted(), not of dict(),
+    # where it used to add a bogus "reversed" entry and left the order ascending
+    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
+    if debug:
+        # a dict cannot be sliced, so take the first five items explicitly
+        for key, value in list(sorted_table.items())[:5]:
+            print(f"{key}: {value}")
+    if add_to_db:
+        term_key = str(term)  # a list term is unhashable, so look up its string form
+        if term_key in table:
+            print("the search term is in the database!")
+        else:
+            # add the newly vectorized term to the words, if not already in the vector table
+            if database != "session/my_words_vec_table.json":
+                database = "session/my_vecs.json"
+            table[term_key] = vector1
+            my_1_writer.safe_my_dict_as_json(database, table)
+    if sorted_table:
+        print(f"the closest word to your input is: {next(iter(sorted_table))}")
+    return sorted_table
+def dot_p_to_1(database, vector1=0, analysis_filename=0):
+    """
+    Saves the dot product between a reference vector and every vector of a JSON database.
+    Parameters:
+    - database: Path to a JSON file mapping keys to vectors.
+    - vector1: 0 selects a constant vector with 1536 components, 1 selects the first vector
+      stored in the database; any other value is used as the reference vector itself.
+    - analysis_filename: Target passed to my_1_writer.safe_my_dict_as_json.
+    Returns:
+    - None
+    """
+    with open(database, "r") as handle:
+        vectors = json.load(handle)
+    if vector1 == 0:
+        reference = [0.025515518153991442] * 1536
+    elif vector1 == 1:
+        reference = vectors[str(list(vectors.keys())[0])]
+    else:
+        reference = vector1
+    results = {name: np.dot(reference, vec) for name, vec in vectors.items()}
+    my_1_writer.safe_my_dict_as_json(analysis_filename, results)
+    print("dot p to 1 saved")
+def lin_dist(database, vector1=0, analysis_filename=0):
+    """
+    Saves the Euclidean distance between a reference vector and every database vector.
+    Parameters:
+    - database: Path to a JSON file mapping keys to vectors.
+    - vector1: 0 selects a constant vector with 1536 components, 1 selects the first vector
+      stored in the database; any other value is used as the reference vector itself.
+    - analysis_filename: Target passed to my_1_writer.safe_my_dict_as_json.
+    Returns:
+    - None
+    """
+    with open(database, "r") as handle:
+        vectors = json.load(handle)
+    if vector1 == 0:
+        reference = [0.025515518153991442] * 1536
+    elif vector1 == 1:
+        reference = vectors[str(list(vectors.keys())[0])]
+    else:
+        reference = vector1
+    results = {}
+    for name, vec in vectors.items():
+        difference = np.array(reference) - np.array(vec)
+        results[name] = np.linalg.norm(difference)
+    my_1_writer.safe_my_dict_as_json(analysis_filename, results)
+    print("lin dist to 1 saved")
+def manhattan_dist(database, vector1=0, analysis_filename=0):
+    """
+    Saves the Manhattan (L1) distance between a reference vector and every database vector.
+    Parameters:
+    - database: Path to a JSON file mapping keys to vectors.
+    - vector1: 0 selects a constant vector with 1536 components, 1 selects the first vector
+      stored in the database; any other value is used as the reference vector itself.
+    - analysis_filename: Target passed to my_1_writer.safe_my_dict_as_json.
+    Returns:
+    - None
+    """
+    with open(database, "r") as f:
+        table = json.load(f)
+    if vector1 == 0:
+        vector1 = [0.025515518153991442 for _ in range(1536)]
+    elif vector1 == 1:
+        vector1 = table[str(list(table.keys())[0])]
+    reference = np.array(vector1)
+    manhattan_dist_to_1 = {}
+    for key, vector2 in table.items():
+        # the L1 distance sums absolute differences; summing the signed differences, as
+        # before, lets positive and negative components cancel each other out
+        manhattan_dist_to_1[key] = abs(reference - np.array(vector2)).sum()
+    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
+    print("manhattan dist to 1 saved")
+#vec_table
+def sim_search_fly(vec_table, term, debug=False):
+    """
+    Ranks the entries of an in-memory vector table by dot product with a search term.
+    Parameters:
+    - vec_table: Dict mapping keys to vectors (lists of numbers).
+    - term: The search text (embedded with my_new_openai.vectorize_data) or an already
+      computed vector given as a list.
+    - debug: Print types, the compared vectors, the scores and the top five results.
+    Returns:
+    - A dict of key -> score. Regular vectors come first, ordered from the highest to the
+      lowest dot product; vectors detected as placeholders follow with the fixed score 200.
+      None is returned when term is neither a str nor a list.
+    """
+    placeholder_score = 200
+    keys = list(vec_table)
+    if debug:
+        print(type(vec_table))
+        print(type(term))
+        if keys:
+            print(type(vec_table[keys[0]]))
+        print("vec table:")
+    # sample output of the sixth entry; guarded so smaller tables no longer raise IndexError
+    if len(keys) > 5:
+        print(vec_table[keys[5]][:4])
+    print("search term")
+    # only sliceable terms are echoed, so invalid input reaches the format check below
+    if isinstance(term, (str, list)):
+        print(term[:4])
+    if isinstance(term, str):
+        print("str")
+        vector1 = my_new_openai.vectorize_data(term)
+    elif isinstance(term, list):
+        print("list")
+        vector1 = term
+    else:
+        print("invalid search_term/search_vector format")
+        return
+    scored = {}
+    placeholders = {}
+    for key, vector2 in vec_table.items():
+        if debug:
+            print("")
+            print(f"{vector1}")
+            print(f"{vector2}")
+            print(f"doing dot product for {key} and {term}")
+        # Vectors whose components repeat at positions 0/1, 3/4 and 5/6 get the fixed score
+        # instead of a dot product; the length check keeps short vectors from raising.
+        # NOTE(review): presumably these are constant filler vectors — confirm.
+        is_placeholder = (len(vector2) > 6 and vector2[0] == vector2[1]
+                          and vector2[3] == vector2[4] and vector2[5] == vector2[6])
+        if is_placeholder:
+            dp = placeholder_score
+            placeholders[key] = dp
+        else:
+            dp = np.dot(vector1, vector2)
+            scored[key] = dp
+        if debug:
+            print(f"the dp is {dp}")
+            print("")
+            print("")
+            print("")
+    # sort with the biggest similarity first; "reverse" is an argument of sorted(), not of
+    # dict(), where it used to add a bogus "reversed" entry and left the order ascending
+    sorted_table = dict(sorted(scored.items(), key=lambda item: item[1], reverse=True))
+    # NOTE(review): placeholders stay at the end of the ranking, where the former ascending
+    # order put them — confirm that they should never be reported as the closest match.
+    sorted_table.update(placeholders)
+    if debug:
+        # a dict cannot be sliced, so take the first five items explicitly
+        for key, value in list(sorted_table.items())[:5]:
+            print(f"{key}: {value}")
+    if sorted_table:
+        print(f"the closest word to your input is: {next(iter(sorted_table))}")
+    return sorted_table

my_new_openai.py → util/my_new_openai.py RENAMED Viewed

@@ -1,151 +1,151 @@
-import os
-from openai import OpenAI
-import requests
-import base64
-client = OpenAI()
-def image_bytes_to_base64(image_bytes):
-    """
-    Converts an image from bytes to a Base64 encoded string.
-    Args:
-    image_bytes (bytes): Byte content of the image.
-    Returns:
-    str: A Base64 encoded string of the image.
-    """
-    return base64.b64encode(image_bytes).decode('utf-8')
-def image_to_base64(image_path):
-    with open(image_path, "rb") as image_file:
-        return str(base64.b64encode(image_file.read()).decode('utf-8'))
-def gpt4_new(prompt_text):
-    gpt_response = client.chat.completions.create(
-        model="gpt-4",
-        messages=[{"role": "system",
-                   "content":   "Du bist eine Maschine, die Dokumente klassifiziert."},
-                  {"role": "user", "content": prompt_text}])
-    return gpt_response.choices[0].message.content
-def vectorize_data(data_input):
-    # input can be list or string:
-    if isinstance(data_input, list):
-        # returning a dictionary
-        my_dict = {}
-        for item in data_input:
-            my_dict[str(item)] = client.embeddings.create(input=data_input,
-                                                          model="text-embedding-ada-002").data[0].embedding
-        return my_dict
-    elif isinstance(data_input, str):
-        # returning just the vector
-        return client.embeddings.create(input=data_input, model="text-embedding-ada-002").data[0].embedding
-    else:
-        print("none")
-def img_create(prompt="a nice house on the beach", download_path=""):
-    # to open, must download
-    my_url = client.images.generate(model="dall-e-3", prompt=prompt, size="1024x1024").data[0].url
-    if download_path:
-        my_image = requests.get(my_url)
-        if my_image.status_code == 200:
-            with open(download_path, 'wb') as f:
-                f.write(my_image.content)
-        else:
-            print("Failed to retrieve image")
-    return my_url
-def img_to_text(img_url="", img_base64="", prompt="What’s in this image?", print_out=True):
-    if img_url:
-        img_desc_response = client.chat.completions.create(
-            model="gpt-4-turbo",
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": prompt},
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": img_url,
-                            },
-                        },
-                    ],
-                }
-            ],
-            max_tokens=500,
-        )
-        if print_out:
-            print(img_desc_response.choices[0].message.content)
-        return img_desc_response.choices[0].message.content
-    elif img_base64:
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
-        }
-        payload = {
-            "model": "gpt-4-turbo",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": prompt
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/jpeg;base64,{img_base64}"
-                            }
-                        }
-                    ]
-                }
-            ],
-            "max_tokens": 300
-        }
-        img_desc_response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
-        if print_out:
-            print(img_desc_response.json()["choices"][0]["message"]["content"])
-        return img_desc_response.json()["choices"][0]["message"]["content"]
-    else:
-        return ValueError
-def encode_image_to_base64(image_path):
-    with open(image_path, "rb") as image_file:
-        encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
-    return encoded_string
-def table_to_text(table=None, prompt="describe this table in plain text. "
-                   "be as precise as possible. spare no detail. "
-                   "what is in this table?", print_out=True):
-    if table is not None:
-        response = gpt4_new(f"{prompt} TABLE: {table}")
-        if print_out:
-            print(response)
-        return response
-    else:
-        return ValueError
-if __name__ == "__main__":
-    #print("here are all functions that directly call openai.")
-    #img_create("a skier in the swiss alps", download_path="skier.png")
-    #img_to_text(img_base64=encode_image_to_base64("skier.png"))
-    #print(image_to_base64("skier.png"))
-    #print(vectorize_data("test string"))
-    print(gpt4_new())

+import os
+from openai import OpenAI
+import requests
+import base64
+client = OpenAI()
+def image_bytes_to_base64(image_bytes):
+    """Encode raw image bytes as a Base64 text string.
+
+    Args:
+        image_bytes (bytes): the image content.
+
+    Returns:
+        str: the Base64 representation, decoded as UTF-8 text.
+    """
+    encoded = base64.b64encode(image_bytes)
+    return encoded.decode('utf-8')
+def image_to_base64(image_path):
+    """Read the file at ``image_path`` and return its content Base64 encoded as a str."""
+    with open(image_path, "rb") as image_file:
+        raw = image_file.read()
+    encoded = base64.b64encode(raw).decode('utf-8')
+    return str(encoded)
+def gpt4_new(prompt_text):
+    """Send ``prompt_text`` to ``gpt-4`` and return the text of the first reply.
+
+    The request carries a fixed German system message telling the model that
+    it is a machine that classifies documents.
+    """
+    system_message = {"role": "system",
+                      "content":   "Du bist eine Maschine, die Dokumente klassifiziert."}
+    user_message = {"role": "user", "content": prompt_text}
+    gpt_response = client.chat.completions.create(
+        model="gpt-4",
+        messages=[system_message, user_message])
+    first_choice = gpt_response.choices[0]
+    return first_choice.message.content
+def vectorize_data(data_input):
+    """Embed text with the ``text-embedding-ada-002`` model.
+
+    Args:
+        data_input: a single string, or a list of items to embed.
+
+    Returns:
+        For a string, its embedding vector. For a list, a dict mapping
+        ``str(item)`` to that item's embedding (empty dict for an empty
+        list). For any other type, prints "none" and returns ``None``.
+    """
+    embedding_model = "text-embedding-ada-002"
+    if isinstance(data_input, list):
+        if not data_input:
+            # Nothing to embed; skip the API call.
+            return {}
+        texts = [str(item) for item in data_input]
+        # One batched request for the whole list; ``record.index`` ties every
+        # embedding back to its position in ``texts`` so each item gets its
+        # own vector rather than the first one.
+        response = client.embeddings.create(input=texts, model=embedding_model)
+        return {texts[record.index]: record.embedding for record in response.data}
+    if isinstance(data_input, str):
+        # returning just the vector
+        return client.embeddings.create(input=data_input, model=embedding_model).data[0].embedding
+    print("none")
+    return None
+def img_create(prompt="a nice house on the beach", download_path=""):
+    """Generate a 1024x1024 image with ``dall-e-3`` and return its URL.
+
+    When ``download_path`` is non-empty the image is also fetched and written
+    to that path; a response with a status other than 200 only prints a
+    failure message. The URL is returned in every case.
+    """
+    generated = client.images.generate(model="dall-e-3", prompt=prompt, size="1024x1024")
+    my_url = generated.data[0].url
+    if not download_path:
+        return my_url
+    my_image = requests.get(my_url)
+    if my_image.status_code != 200:
+        print("Failed to retrieve image")
+        return my_url
+    with open(download_path, 'wb') as f:
+        f.write(my_image.content)
+    return my_url
+def img_to_text(img_url="", img_base64="", prompt="What’s in this image?", print_out=True):
+    """Ask ``gpt-4-turbo`` to describe an image.
+
+    Args:
+        img_url: URL of the image; used when non-empty, even if
+            ``img_base64`` is given as well.
+        img_base64: Base64 encoded image, sent as a JPEG data URL.
+        prompt: instruction sent along with the image.
+        print_out: print the model's answer before returning it.
+
+    Returns:
+        The text content of the first completion choice.
+
+    Raises:
+        ValueError: if neither ``img_url`` nor ``img_base64`` is given.
+    """
+    # The URL variant allows 500 completion tokens, the Base64 variant 300.
+    if img_url:
+        image_ref, token_limit = img_url, 500
+    elif img_base64:
+        image_ref, token_limit = f"data:image/jpeg;base64,{img_base64}", 300
+    else:
+        raise ValueError("img_to_text needs either img_url or img_base64")
+    # Both variants go through the SDK client: a data URL is accepted in the
+    # same ``image_url`` field as a regular URL.
+    img_desc_response = client.chat.completions.create(
+        model="gpt-4-turbo",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "image_url", "image_url": {"url": image_ref}},
+                ],
+            }
+        ],
+        max_tokens=token_limit,
+    )
+    description = img_desc_response.choices[0].message.content
+    if print_out:
+        print(description)
+    return description
+def encode_image_to_base64(image_path):
+    """Return the Base64 text encoding of the file stored at ``image_path``."""
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+def table_to_text(table=None, prompt="describe this table in plain text. "
+                   "be as precise as possible. spare no detail. "
+                   "what is in this table?", print_out=True):
+    """Ask ``gpt4_new`` for a plain-text description of ``table``.
+
+    Args:
+        table: the table to describe; it is interpolated into the prompt
+            with its string representation.
+        prompt: instruction placed in front of the table.
+        print_out: print the answer before returning it.
+
+    Returns:
+        The model's answer as returned by ``gpt4_new``.
+
+    Raises:
+        ValueError: if ``table`` is ``None``.
+    """
+    if table is None:
+        # Raise instead of handing the exception class back as a value.
+        raise ValueError("table_to_text needs a table")
+    response = gpt4_new(f"{prompt} TABLE: {table}")
+    if print_out:
+        print(response)
+    return response
+if __name__ == "__main__":
+    # Manual smoke tests; uncomment the call to try.
+    #print("here are all functions that directly call openai.")
+    #img_create("a skier in the swiss alps", download_path="skier.png")
+    #img_to_text(img_base64=encode_image_to_base64("skier.png"))
+    #print(image_to_base64("skier.png"))
+    #print(vectorize_data("test string"))
+    # gpt4_new has a required prompt argument; calling it bare raises TypeError.
+    print(gpt4_new("test string"))

my_vectors.py → util/my_vectors.py RENAMED Viewed

File without changes

setup_db.py → util/setup_db.py RENAMED Viewed

@@ -1,50 +1,50 @@
-import time
-import openpyxl
-import my_new_openai
-def update_excel_with_sums(filename):
-    # Load the workbook and select the active worksheet
-    workbook = openpyxl.load_workbook(filename)
-    sheet = workbook.active
-    # Iterate through each row in the sheet
-    for row in sheet.iter_rows(min_row=1, min_col=2, max_col=3):
-        Bn, Cn = row  # Assuming B and C are columns 2 and 3 respectively
-        vector = my_new_openai.vectorize_data(f"{Bn.value}: {Cn.value}") if Bn.value and Cn.value else 0
-        if vector != 0:
-            for val in vector:
-                sheet.cell(row=Bn.row, column=4+vector.index(val)).value = val
-    # Save the workbook
-    workbook.save(filename)
-    print(f"Updated the file '{filename}' with vectors in column D.")
-def load_vectorstore_from_excel(filename):
-    # returns a dictonary
-    # Load the workbook and select the active worksheet
-    workbook = openpyxl.load_workbook(filename)
-    sheet = workbook.active
-    # Iterate through each row in the sheet
-    vec_store = {}
-    for row in range(3, 634):
-        vec = []
-        for col in range(0, 1536):
-            val = sheet.cell(row=row, column=4+col).value
-            vec.append(val)
-        vec_store[str(sheet.cell(row=row, column=1).value)] = vec
-    return vec_store
-if __name__ == '__main__':
-    #update_excel_with_sums("KBOB_Klassifizierung.xlsx")
-    t = time.time()
-    vec_store = load_vectorstore_from_excel("KBOB_Klassifizierung.xlsx")
-    print(time.time()-t)
-    for e in vec_store.keys():
-        print(f"{e}: {vec_store[e][0]}, {vec_store[e][1]}, .... {vec_store[e][-1]}")

+import time
+import openpyxl
+import my_new_openai
+def update_excel_with_sums(filename):
+    """Embed columns B and C of every row and store the vector in the sheet.
+
+    For each row of the active worksheet where both B and C are non-empty,
+    the string ``"<B>: <C>"`` is embedded with
+    ``my_new_openai.vectorize_data`` and its components are written to
+    consecutive cells of that row, starting at column D. The workbook is
+    then saved back to ``filename``.
+
+    Args:
+        filename: path of the Excel workbook to update in place.
+    """
+    # Load the workbook and select the active worksheet
+    workbook = openpyxl.load_workbook(filename)
+    sheet = workbook.active
+    for Bn, Cn in sheet.iter_rows(min_row=1, min_col=2, max_col=3):
+        if not (Bn.value and Cn.value):
+            continue
+        vector = my_new_openai.vectorize_data(f"{Bn.value}: {Cn.value}")
+        # enumerate gives every component its own column; looking the
+        # position up with list.index() would send repeated values to the
+        # first match and leave the later cells unwritten.
+        for offset, val in enumerate(vector):
+            sheet.cell(row=Bn.row, column=4 + offset).value = val
+    # Save the workbook
+    workbook.save(filename)
+    print(f"Updated the file '{filename}' with vectors in column D.")
+def load_vectorstore_from_excel(filename, first_row=3, last_row=633, vector_dim=1536):
+    """Load label -> vector pairs from the active sheet of an Excel workbook.
+
+    Args:
+        filename: path of the workbook.
+        first_row: first sheet row to read (1-based).
+        last_row: last sheet row to read, inclusive.
+        vector_dim: number of vector components, read from column D onwards.
+
+    Returns:
+        dict mapping ``str`` of each row's column-A value to the list of its
+        ``vector_dim`` cell values. Rows with the same column-A value
+        overwrite each other; the last one wins.
+    """
+    # Load the workbook and select the active worksheet
+    workbook = openpyxl.load_workbook(filename)
+    sheet = workbook.active
+    first_vector_col = 4  # column D
+    rows = sheet.iter_rows(min_row=first_row, max_row=last_row, min_col=1,
+                           max_col=first_vector_col + vector_dim - 1, values_only=True)
+    # Each row tuple starts at column A, so the vector begins at index 3.
+    return {str(row[0]): list(row[first_vector_col - 1:]) for row in rows}
+if __name__ == '__main__':
+    from pathlib import Path
+    #update_excel_with_sums("KBOB_Klassifizierung.xlsx")
+    # Resolve the workbook relative to this file so the script also works
+    # when it is started from another working directory.
+    workbook_path = Path(__file__).resolve().parent.parent / "data" / "KBOB_Klassifizierung.xlsx"
+    t = time.time()
+    vec_store = load_vectorstore_from_excel(str(workbook_path))
+    print(time.time()-t)
+    for e, vec in vec_store.items():
+        print(f"{e}: {vec[0]}, {vec[1]}, .... {vec[-1]}")