elia-waefler committed on
Commit
0c4e36a
·
1 Parent(s): 6bccf6f
.gitignore CHANGED
@@ -120,9 +120,7 @@ celerybeat.pid
120
 
121
  # Environments
122
  .env
123
- .venv
124
  env/
125
- venv/
126
  ENV/
127
  env.bak/
128
  venv.bak/
 
120
 
121
  # Environments
122
  .env
 
123
  env/
 
124
  ENV/
125
  env.bak/
126
  venv.bak/
LICENSE CHANGED
@@ -1,201 +1,201 @@
1
- Apache License
2
- Version 2.0, January 2004
3
- http://www.apache.org/licenses/
4
-
5
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
-
7
- 1. Definitions.
8
-
9
- "License" shall mean the terms and conditions for use, reproduction,
10
- and distribution as defined by Sections 1 through 9 of this document.
11
-
12
- "Licensor" shall mean the copyright owner or entity authorized by
13
- the copyright owner that is granting the License.
14
-
15
- "Legal Entity" shall mean the union of the acting entity and all
16
- other entities that control, are controlled by, or are under common
17
- control with that entity. For the purposes of this definition,
18
- "control" means (i) the power, direct or indirect, to cause the
19
- direction or management of such entity, whether by contract or
20
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
- outstanding shares, or (iii) beneficial ownership of such entity.
22
-
23
- "You" (or "Your") shall mean an individual or Legal Entity
24
- exercising permissions granted by this License.
25
-
26
- "Source" form shall mean the preferred form for making modifications,
27
- including but not limited to software source code, documentation
28
- source, and configuration files.
29
-
30
- "Object" form shall mean any form resulting from mechanical
31
- transformation or translation of a Source form, including but
32
- not limited to compiled object code, generated documentation,
33
- and conversions to other media types.
34
-
35
- "Work" shall mean the work of authorship, whether in Source or
36
- Object form, made available under the License, as indicated by a
37
- copyright notice that is included in or attached to the work
38
- (an example is provided in the Appendix below).
39
-
40
- "Derivative Works" shall mean any work, whether in Source or Object
41
- form, that is based on (or derived from) the Work and for which the
42
- editorial revisions, annotations, elaborations, or other modifications
43
- represent, as a whole, an original work of authorship. For the purposes
44
- of this License, Derivative Works shall not include works that remain
45
- separable from, or merely link (or bind by name) to the interfaces of,
46
- the Work and Derivative Works thereof.
47
-
48
- "Contribution" shall mean any work of authorship, including
49
- the original version of the Work and any modifications or additions
50
- to that Work or Derivative Works thereof, that is intentionally
51
- submitted to Licensor for inclusion in the Work by the copyright owner
52
- or by an individual or Legal Entity authorized to submit on behalf of
53
- the copyright owner. For the purposes of this definition, "submitted"
54
- means any form of electronic, verbal, or written communication sent
55
- to the Licensor or its representatives, including but not limited to
56
- communication on electronic mailing lists, source code control systems,
57
- and issue tracking systems that are managed by, or on behalf of, the
58
- Licensor for the purpose of discussing and improving the Work, but
59
- excluding communication that is conspicuously marked or otherwise
60
- designated in writing by the copyright owner as "Not a Contribution."
61
-
62
- "Contributor" shall mean Licensor and any individual or Legal Entity
63
- on behalf of whom a Contribution has been received by Licensor and
64
- subsequently incorporated within the Work.
65
-
66
- 2. Grant of Copyright License. Subject to the terms and conditions of
67
- this License, each Contributor hereby grants to You a perpetual,
68
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
- copyright license to reproduce, prepare Derivative Works of,
70
- publicly display, publicly perform, sublicense, and distribute the
71
- Work and such Derivative Works in Source or Object form.
72
-
73
- 3. Grant of Patent License. Subject to the terms and conditions of
74
- this License, each Contributor hereby grants to You a perpetual,
75
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
- (except as stated in this section) patent license to make, have made,
77
- use, offer to sell, sell, import, and otherwise transfer the Work,
78
- where such license applies only to those patent claims licensable
79
- by such Contributor that are necessarily infringed by their
80
- Contribution(s) alone or by combination of their Contribution(s)
81
- with the Work to which such Contribution(s) was submitted. If You
82
- institute patent litigation against any entity (including a
83
- cross-claim or counterclaim in a lawsuit) alleging that the Work
84
- or a Contribution incorporated within the Work constitutes direct
85
- or contributory patent infringement, then any patent licenses
86
- granted to You under this License for that Work shall terminate
87
- as of the date such litigation is filed.
88
-
89
- 4. Redistribution. You may reproduce and distribute copies of the
90
- Work or Derivative Works thereof in any medium, with or without
91
- modifications, and in Source or Object form, provided that You
92
- meet the following conditions:
93
-
94
- (a) You must give any other recipients of the Work or
95
- Derivative Works a copy of this License; and
96
-
97
- (b) You must cause any modified files to carry prominent notices
98
- stating that You changed the files; and
99
-
100
- (c) You must retain, in the Source form of any Derivative Works
101
- that You distribute, all copyright, patent, trademark, and
102
- attribution notices from the Source form of the Work,
103
- excluding those notices that do not pertain to any part of
104
- the Derivative Works; and
105
-
106
- (d) If the Work includes a "NOTICE" text file as part of its
107
- distribution, then any Derivative Works that You distribute must
108
- include a readable copy of the attribution notices contained
109
- within such NOTICE file, excluding those notices that do not
110
- pertain to any part of the Derivative Works, in at least one
111
- of the following places: within a NOTICE text file distributed
112
- as part of the Derivative Works; within the Source form or
113
- documentation, if provided along with the Derivative Works; or,
114
- within a display generated by the Derivative Works, if and
115
- wherever such third-party notices normally appear. The contents
116
- of the NOTICE file are for informational purposes only and
117
- do not modify the License. You may add Your own attribution
118
- notices within Derivative Works that You distribute, alongside
119
- or as an addendum to the NOTICE text from the Work, provided
120
- that such additional attribution notices cannot be construed
121
- as modifying the License.
122
-
123
- You may add Your own copyright statement to Your modifications and
124
- may provide additional or different license terms and conditions
125
- for use, reproduction, or distribution of Your modifications, or
126
- for any such Derivative Works as a whole, provided Your use,
127
- reproduction, and distribution of the Work otherwise complies with
128
- the conditions stated in this License.
129
-
130
- 5. Submission of Contributions. Unless You explicitly state otherwise,
131
- any Contribution intentionally submitted for inclusion in the Work
132
- by You to the Licensor shall be under the terms and conditions of
133
- this License, without any additional terms or conditions.
134
- Notwithstanding the above, nothing herein shall supersede or modify
135
- the terms of any separate license agreement you may have executed
136
- with Licensor regarding such Contributions.
137
-
138
- 6. Trademarks. This License does not grant permission to use the trade
139
- names, trademarks, service marks, or product names of the Licensor,
140
- except as required for reasonable and customary use in describing the
141
- origin of the Work and reproducing the content of the NOTICE file.
142
-
143
- 7. Disclaimer of Warranty. Unless required by applicable law or
144
- agreed to in writing, Licensor provides the Work (and each
145
- Contributor provides its Contributions) on an "AS IS" BASIS,
146
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
- implied, including, without limitation, any warranties or conditions
148
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
- PARTICULAR PURPOSE. You are solely responsible for determining the
150
- appropriateness of using or redistributing the Work and assume any
151
- risks associated with Your exercise of permissions under this License.
152
-
153
- 8. Limitation of Liability. In no event and under no legal theory,
154
- whether in tort (including negligence), contract, or otherwise,
155
- unless required by applicable law (such as deliberate and grossly
156
- negligent acts) or agreed to in writing, shall any Contributor be
157
- liable to You for damages, including any direct, indirect, special,
158
- incidental, or consequential damages of any character arising as a
159
- result of this License or out of the use or inability to use the
160
- Work (including but not limited to damages for loss of goodwill,
161
- work stoppage, computer failure or malfunction, or any and all
162
- other commercial damages or losses), even if such Contributor
163
- has been advised of the possibility of such damages.
164
-
165
- 9. Accepting Warranty or Additional Liability. While redistributing
166
- the Work or Derivative Works thereof, You may choose to offer,
167
- and charge a fee for, acceptance of support, warranty, indemnity,
168
- or other liability obligations and/or rights consistent with this
169
- License. However, in accepting such obligations, You may act only
170
- on Your own behalf and on Your sole responsibility, not on behalf
171
- of any other Contributor, and only if You agree to indemnify,
172
- defend, and hold each Contributor harmless for any liability
173
- incurred by, or claims asserted against, such Contributor by reason
174
- of your accepting any such warranty or additional liability.
175
-
176
- END OF TERMS AND CONDITIONS
177
-
178
- APPENDIX: How to apply the Apache License to your work.
179
-
180
- To apply the Apache License to your work, attach the following
181
- boilerplate notice, with the fields enclosed by brackets "[]"
182
- replaced with your own identifying information. (Don't include
183
- the brackets!) The text should be enclosed in the appropriate
184
- comment syntax for the file format. We also recommend that a
185
- file or class name and description of purpose be included on the
186
- same "printed page" as the copyright notice for easier
187
- identification within third-party archives.
188
-
189
- Copyright [yyyy] [name of copyright owner]
190
-
191
- Licensed under the Apache License, Version 2.0 (the "License");
192
- you may not use this file except in compliance with the License.
193
- You may obtain a copy of the License at
194
-
195
- http://www.apache.org/licenses/LICENSE-2.0
196
-
197
- Unless required by applicable law or agreed to in writing, software
198
- distributed under the License is distributed on an "AS IS" BASIS,
199
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
- See the License for the specific language governing permissions and
201
- limitations under the License.
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
app.py CHANGED
@@ -1,14 +1,10 @@
1
  """
2
  testing my own vectors
3
  """
4
- import ingest
5
- import my_2_sim_search
6
- import my_new_openai
7
- import setup_db
8
  import time
9
  import streamlit as st
10
  import os
11
- import my_vectors
12
 
13
 
14
  def merge_indices(index1, index2):
@@ -148,7 +144,7 @@ def main():
148
  st.warning("unsaved embeddings will be lost.")
149
  else:
150
  file = st.file_uploader("upload file", accept_multiple_files=False)
151
- vec_store = setup_db.load_vectorstore_from_excel("KBOB_Klassifizierung.xlsx")
152
  if st.button("classify me!"):
153
  with st.spinner("Classifying..."):
154
  query_vecs = []
 
1
  """
2
  testing my own vectors
3
  """
4
+ from util import setup_db, my_vectors, my_2_sim_search, ingest, my_new_openai
 
 
 
5
  import time
6
  import streamlit as st
7
  import os
 
8
 
9
 
10
  def merge_indices(index1, index2):
 
144
  st.warning("unsaved embeddings will be lost.")
145
  else:
146
  file = st.file_uploader("upload file", accept_multiple_files=False)
147
+ vec_store = setup_db.load_vectorstore_from_excel("data/KBOB_Klassifizierung.xlsx")
148
  if st.button("classify me!"):
149
  with st.spinner("Classifying..."):
150
  query_vecs = []
KBOB_Klassifizierung.xlsx → data/KBOB_Klassifizierung.xlsx RENAMED
File without changes
requirements.txt CHANGED
@@ -1,16 +1,16 @@
1
- streamlit~=1.33.0
2
- bcrypt~=4.1.2
3
- psycopg2-binary~=2.9.9
4
- openai~=1.23.2
5
- pypdf2~=3.0.1
6
- langchain~=0.1.16
7
- tiktoken~=0.6.0
8
- numpy~=1.26.4
9
- requests~=2.31.0
10
- pandas~=2.2.2
11
- tabula~=1.0.5
12
- pdfplumber~=0.11.0
13
- PyMuPDF~=1.24.3
14
- fitz~=0.0.1.dev2
15
- pillow~=10.3.0
16
  openpyxl~=3.1.2
 
1
+ streamlit~=1.33.0
2
+ bcrypt~=4.1.2
3
+ psycopg2-binary~=2.9.9
4
+ openai~=1.23.2
5
+ pypdf2~=3.0.1
6
+ langchain~=0.1.16
7
+ tiktoken~=0.6.0
8
+ numpy~=1.26.4
9
+ requests~=2.31.0
10
+ pandas~=2.2.2
11
+ tabula~=1.0.5
12
+ pdfplumber~=0.11.0
13
+ PyMuPDF~=1.24.3
14
+ fitz~=0.0.1.dev2
15
+ pillow~=10.3.0
16
  openpyxl~=3.1.2
ask_app.py → util/ask_app.py RENAMED
@@ -1,243 +1,243 @@
1
- """
2
- complete, functional RAG App
3
- stores vectors in session state, or locally.
4
- add function to display retrieved documents
5
- """
6
-
7
- # import time
8
- from datetime import datetime
9
- # import openai
10
- # import tiktoken
11
- import streamlit as st
12
- from PyPDF2 import PdfReader
13
- from langchain.text_splitter import CharacterTextSplitter
14
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
15
- from langchain.vectorstores import FAISS
16
- from langchain.chat_models import ChatOpenAI
17
- from langchain.memory import ConversationBufferMemory
18
- from langchain.chains import ConversationalRetrievalChain
19
- from html_templates import css, bot_template, user_template
20
- from langchain.llms import HuggingFaceHub
21
- import os
22
- import numpy as np
23
- import faiss_utils
24
- from langchain_community.vectorstores import FAISS
25
- from langchain.embeddings import OpenAIEmbeddings
26
-
27
-
28
- def merge_faiss_indices(index1, index2):
29
- """
30
- Merge two FAISS indices into a new index, assuming both are of the same type and dimensionality.
31
-
32
- Args:
33
- index1 (faiss.Index): The first FAISS index.
34
- index2 (faiss.Index): The second FAISS index.
35
-
36
- Returns:
37
- faiss.Index: A new FAISS index containing all vectors from index1 and index2.
38
- """
39
-
40
- # Check if both indices are the same type
41
- if type(index1) != type(index2):
42
- raise ValueError("Indices are of different types")
43
-
44
- # Check dimensionality
45
- if index1.d != index2.d:
46
- raise ValueError("Indices have different dimensionality")
47
-
48
- # Determine type of indices
49
- if isinstance(index1, FAISS.IndexFlatL2):
50
- # Handle simple flat indices
51
- d = index1.d
52
- # Extract vectors from both indices
53
- xb1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
54
- xb2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)
55
-
56
- # Combine vectors
57
- xb_combined = np.vstack((xb1, xb2))
58
-
59
- # Create a new index and add combined vectors
60
- new_index = FAISS.IndexFlatL2(d)
61
- new_index.add(xb_combined)
62
- return new_index
63
-
64
- elif isinstance(index1, FAISS.IndexIVFFlat):
65
- # Handle quantized indices (IndexIVFFlat)
66
- d = index1.d
67
- nlist = index1.nlist
68
- quantizer = FAISS.IndexFlatL2(d) # Re-create the appropriate quantizer
69
-
70
- # Create a new index with the same configuration
71
- new_index = FAISS.IndexIVFFlat(quantizer, d, nlist, FAISS.METRIC_L2)
72
-
73
- # If the indices are already trained, you can directly add the vectors
74
- # Otherwise, you may need to train new_index using a representative subset of vectors
75
- vecs1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
76
- vecs2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)
77
- new_index.add(vecs1)
78
- new_index.add(vecs2)
79
- return new_index
80
-
81
- else:
82
- raise TypeError("Index type not supported for merging in this function")
83
-
84
-
85
- def get_pdf_text(pdf_docs):
86
- text = ""
87
- for pdf in pdf_docs:
88
- pdf_reader = PdfReader(pdf)
89
- for page in pdf_reader.pages:
90
- text += page.extract_text()
91
- return text
92
-
93
-
94
- def get_text_chunks(text):
95
- text_splitter = CharacterTextSplitter(
96
- separator="\n",
97
- chunk_size=1000,
98
- chunk_overlap=200,
99
- length_function=len
100
- )
101
- chunks = text_splitter.split_text(text)
102
- return chunks
103
-
104
-
105
- def get_faiss_vectorstore(text_chunks):
106
- if sst.openai:
107
- my_embeddings = OpenAIEmbeddings()
108
- else:
109
- my_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
110
- vectorstore = FAISS.from_texts(texts=text_chunks, embedding=my_embeddings)
111
- return vectorstore
112
-
113
-
114
- def get_conversation_chain(vectorstore):
115
- if sst.openai:
116
- llm = ChatOpenAI()
117
- else:
118
- llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})
119
-
120
- memory = ConversationBufferMemory(
121
- memory_key='chat_history', return_messages=True)
122
- conversation_chain = ConversationalRetrievalChain.from_llm(
123
- llm=llm,
124
- retriever=vectorstore.as_retriever(),
125
- memory=memory
126
- )
127
- return conversation_chain
128
-
129
-
130
- def handle_userinput(user_question):
131
- response = sst.conversation({'question': user_question})
132
- sst.chat_history = response['chat_history']
133
-
134
- for i, message in enumerate(sst.chat_history):
135
- # Display user message
136
- if i % 2 == 0:
137
- st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
138
- else:
139
- print(message)
140
- # Display AI response
141
- st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
142
- # Display source document information if available in the message
143
- if hasattr(message, 'source') and message.source:
144
- st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
145
-
146
-
147
- if True:
148
- BASE_URL = "https://api.vectara.io/v1"
149
- OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
150
- OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
151
- PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
152
- HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
153
- VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
154
- VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]
155
- headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}
156
-
157
-
158
- def main():
159
- st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
160
- st.write(css, unsafe_allow_html=True)
161
- if "conversation" not in sst:
162
- sst.conversation = None
163
- if "chat_history" not in sst:
164
- sst.chat_history = None
165
- if "page" not in sst:
166
- sst.page = "home"
167
- if "openai" not in sst:
168
- sst.openai = True
169
- if "login" not in sst:
170
- sst.login = False
171
- if 'submitted_user_query' not in sst:
172
- sst.submitted_user_query = ''
173
- if 'submitted_user_safe' not in sst:
174
- sst.submitted_user_safe = ''
175
- if 'submitted_user_load' not in sst:
176
- sst.submitted_user_load = ''
177
-
178
- def submit_user_query():
179
- sst.submitted_user_query = sst.widget_user_query
180
- sst.widget_user_query = ''
181
-
182
- def submit_user_safe():
183
- sst.submitted_user_safe = sst.widget_user_safe
184
- sst.widget_user_safe = ''
185
- if "vectorstore" in sst:
186
- # faiss_name = str(datetime.now().strftime("%Y%m%d%H%M%S")) + "faiss_index"
187
- faiss_utils.save_local(sst.vectorstore, path=sst.submitted_user_safe)
188
- st.sidebar.success("saved")
189
- else:
190
- st.sidebar.warning("No embeddings to save. Please process documents first.")
191
-
192
- def submit_user_load():
193
- sst.submitted_user_load = sst.widget_user_load
194
- sst.widget_user_load = ''
195
- if os.path.exists(sst.submitted_user_load):
196
- new_db = faiss_utils.load_vectorstore(f"{sst.submitted_user_load}/faiss_index.index")
197
- if "vectorstore" in sst:
198
- if new_db is not None: # Check if this is working
199
- sst.vectorstore.merge_from(new_db)
200
- sst.conversation = get_conversation_chain(sst.vectorstore)
201
- st.sidebar.success("faiss loaded")
202
- else:
203
- if new_db is not None: # Check if this is working
204
- sst.vectorstore = new_db
205
- sst.conversation = get_conversation_chain(new_db)
206
- st.sidebar.success("faiss loaded")
207
- else:
208
- st.sidebar.warning("Couldn't load/find embeddings")
209
-
210
- st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
211
- if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
212
-
213
- #user_question = st.text_input("Ask a question about your documents:", key="user_query", on_change=handle_query)
214
- st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)
215
- #sst.openai = st.toggle(label="use openai?")
216
-
217
- if sst.submitted_user_query:
218
- if "vectorstore" in sst:
219
- handle_userinput(sst.submitted_user_query)
220
- else:
221
- st.warning("no vectorstore loaded.")
222
-
223
- with st.sidebar:
224
- st.subheader("Your documents")
225
- pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
226
- if st.button("Process"):
227
- with st.spinner("Processing"):
228
- vec = get_faiss_vectorstore(get_text_chunks(get_pdf_text(pdf_docs)))
229
- sst.vectorstore = vec
230
- sst.conversation = get_conversation_chain(vec)
231
- st.success("embedding complete")
232
-
233
- st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
234
- on_change=submit_user_safe)
235
-
236
- st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
237
- on_change=submit_user_load)
238
-
239
-
240
- if __name__ == '__main__':
241
- sst = st.session_state
242
- ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
243
- main()
 
1
+ """
2
+ complete, functional RAG App
3
+ stores vectors in session state, or locally.
4
+ add function to display retrieved documents
5
+ """
6
+
7
+ # import time
8
+ from datetime import datetime
9
+ # import openai
10
+ # import tiktoken
11
+ import streamlit as st
12
+ from PyPDF2 import PdfReader
13
+ from langchain.text_splitter import CharacterTextSplitter
14
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
15
+ from langchain.vectorstores import FAISS
16
+ from langchain.chat_models import ChatOpenAI
17
+ from langchain.memory import ConversationBufferMemory
18
+ from langchain.chains import ConversationalRetrievalChain
19
+ from html_templates import css, bot_template, user_template
20
+ from langchain.llms import HuggingFaceHub
21
+ import os
22
+ import numpy as np
23
+ import faiss_utils
24
+ from langchain_community.vectorstores import FAISS
25
+ from langchain.embeddings import OpenAIEmbeddings
26
+
27
+
28
def merge_faiss_indices(index1, index2):
    """
    Merge two FAISS indices into a new index, leaving both inputs untouched.

    Args:
        index1 (faiss.Index): The first FAISS index.
        index2 (faiss.Index): The second FAISS index.

    Returns:
        faiss.Index: A new index holding the vectors of index1 followed by
        those of index2.

    Raises:
        ValueError: If the indices differ in type or dimensionality.
        TypeError: If the index type is neither IndexFlatL2 nor IndexIVFFlat.
    """
    if type(index1) is not type(index2):
        raise ValueError("Indices are of different types")
    if index1.d != index2.d:
        raise ValueError("Indices have different dimensionality")

    # The native library is imported locally: the module-level name ``FAISS``
    # is LangChain's vector-store wrapper and does not expose IndexFlatL2,
    # IndexIVFFlat or any of the low-level helpers.
    import faiss

    if isinstance(index1, faiss.IndexFlatL2):
        merged = faiss.IndexFlatL2(index1.d)
        for source in (index1, index2):
            # reconstruct_n yields an (ntotal, d) float32 matrix; skip empty
            # indices so we never ask for a zero-length reconstruction.
            if source.ntotal:
                merged.add(source.reconstruct_n(0, source.ntotal))
        return merged

    if isinstance(index1, faiss.IndexIVFFlat):
        # merge_from moves the inverted lists out of its argument, so both
        # inputs are cloned first to keep them intact.
        # NOTE(review): merge_from requires both indices to share the same
        # trained coarse quantizer -- confirm this holds for the callers.
        merged = faiss.clone_index(index1)
        merged.merge_from(faiss.clone_index(index2), merged.ntotal)
        return merged

    raise TypeError("Index type not supported for merging in this function")
83
+
84
+
85
def get_pdf_text(pdf_docs):
    """Return the concatenated text of every page of every PDF in *pdf_docs*.

    Args:
        pdf_docs: Iterable of paths or file-like objects accepted by PdfReader.

    Returns:
        str: All page texts joined without a separator; "" for no documents.
    """
    # ``or ""`` keeps pages without extractable text from breaking the join.
    return "".join(
        page.extract_text() or ""
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )
92
+
93
+
94
def get_text_chunks(text):
    """Split *text* on newlines into chunks of up to 1000 characters.

    Consecutive chunks overlap by 200 characters; length is measured with len.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
103
+
104
+
105
def get_faiss_vectorstore(text_chunks):
    """Embed *text_chunks* and return them as a FAISS vector store.

    The embedding backend follows the ``sst.openai`` session flag: OpenAI when
    it is truthy, the instructor-xl HuggingFace model otherwise.
    """
    embedder = (
        OpenAIEmbeddings()
        if sst.openai
        else HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
112
+
113
+
114
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over *vectorstore*.

    The language model follows the ``sst.openai`` session flag; the chain keeps
    its dialogue in a buffer memory stored under the key 'chat_history'.
    """
    if not sst.openai:
        chat_model = HuggingFaceHub(
            repo_id="google/flan-t5-xxl",
            model_kwargs={"temperature": 0.5, "max_length": 512},
        )
    else:
        chat_model = ChatOpenAI()

    history = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=vectorstore.as_retriever(),
        memory=history,
    )
128
+
129
+
130
def handle_userinput(user_question):
    """Send *user_question* to the conversation chain and render the dialogue.

    The chain's updated history is stored in ``sst.chat_history``. Messages at
    even positions are rendered with the user template, odd positions with the
    bot template. Message text is HTML-escaped before it is inserted into the
    templates because they are rendered with ``unsafe_allow_html=True``.
    """
    import html

    response = sst.conversation({'question': user_question})
    sst.chat_history = response['chat_history']

    for position, message in enumerate(sst.chat_history):
        is_user_turn = position % 2 == 0
        template = user_template if is_user_turn else bot_template
        # Escape so that user- or model-supplied markup cannot inject HTML.
        safe_text = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_text), unsafe_allow_html=True)

        if is_user_turn:
            continue
        # Show source information only when the message object carries it.
        source = getattr(message, 'source', None)
        if source:
            st.write(f"Source Document: {html.escape(str(source))}", unsafe_allow_html=True)
145
+
146
+
147
# Service endpoint and credentials, read once when the module is loaded.
# Every variable is mandatory: a missing one raises KeyError immediately.
BASE_URL = "https://api.vectara.io/v1"

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]

# Default HTTP headers carrying the Vectara key as a bearer token.
headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}
156
+
157
+
158
def main():
    """Render the Streamlit page of the RAG assistant.

    Initialises the session-state defaults, registers the text-input callbacks
    and, once the password matches ``ASK_ASH_PASSWORD``, shows the question
    box and the sidebar for processing, saving and loading embeddings.
    """
    st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
    st.write(css, unsafe_allow_html=True)

    session_defaults = {
        "conversation": None,
        "chat_history": None,
        "page": "home",
        "openai": True,
        "login": False,
        "submitted_user_query": '',
        "submitted_user_safe": '',
        "submitted_user_load": '',
    }
    for key, default in session_defaults.items():
        if key not in sst:
            sst[key] = default

    def submit_user_query():
        # Move the typed question into session state and clear the widget.
        sst.submitted_user_query = sst.widget_user_query
        sst.widget_user_query = ''

    def submit_user_safe():
        # Persist the current vector store to the folder the user entered.
        sst.submitted_user_safe = sst.widget_user_safe
        sst.widget_user_safe = ''
        if "vectorstore" in sst:
            faiss_utils.save_local(sst.vectorstore, path=sst.submitted_user_safe)
            st.sidebar.success("saved")
        else:
            st.sidebar.warning("No embeddings to save. Please process documents first.")

    def submit_user_load():
        # Load embeddings from the entered folder and merge them into the
        # session's vector store (or adopt them if none exists yet).
        sst.submitted_user_load = sst.widget_user_load
        sst.widget_user_load = ''

        new_db = None
        if os.path.exists(sst.submitted_user_load):
            new_db = faiss_utils.load_vectorstore(f"{sst.submitted_user_load}/faiss_index.index")

        # Warn both when the path is missing and when loading produced nothing,
        # so a failed load is never silent.
        if new_db is None:
            st.sidebar.warning("Couldn't load/find embeddings")
            return

        if "vectorstore" in sst:
            sst.vectorstore.merge_from(new_db)
        else:
            sst.vectorstore = new_db
        sst.conversation = get_conversation_chain(sst.vectorstore)
        st.sidebar.success("faiss loaded")

    st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
        st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)

        if sst.submitted_user_query:
            if "vectorstore" in sst:
                handle_userinput(sst.submitted_user_query)
            else:
                st.warning("no vectorstore loaded.")

        # The sidebar is rendered only after the password check succeeded.
        with st.sidebar:
            st.subheader("Your documents")
            pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
            if st.button("Process"):
                if not pdf_docs:
                    st.warning("Please upload at least one PDF first.")
                else:
                    with st.spinner("Processing"):
                        chunks = get_text_chunks(get_pdf_text(pdf_docs))
                        if chunks:
                            vec = get_faiss_vectorstore(chunks)
                            sst.vectorstore = vec
                            sst.conversation = get_conversation_chain(vec)
                            st.success("embedding complete")
                        else:
                            # Building a store from zero texts would fail.
                            st.warning("No text could be extracted from the uploaded PDFs.")

            st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
                          on_change=submit_user_safe)

            st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
                          on_change=submit_user_load)


if __name__ == '__main__':
    # Module-level names read by the functions above.
    sst = st.session_state
    ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
    main()
classify_app.py → util/classify_app.py RENAMED
@@ -1,197 +1,197 @@
1
- import streamlit as st
2
- import os
3
- # import openai
4
- from PyPDF2 import PdfReader
5
- from openai import OpenAI
6
- from langchain.chat_models import ChatOpenAI
7
-
8
- ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
9
-
10
-
11
- def gpt4_new(prompt_text):
12
- client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
13
- response = client.chat.completions.create(
14
- model="gpt-4",
15
- messages=[{"role": "system",
16
- "content": "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten,"
17
- "das Dokument in vorgegebene Kategorien klassifiziert."
18
- "Du gibts möglichst kurze Antworten, am besten ein Wort"
19
- "Du gibst keine Erklärungen oder Begründungen. "
20
- "Du klassifizierst nur nach den vorgegebenen Kategorien."
21
- "Wenn ein Dokument partout nicht klassifizierbar ist, "
22
- "antwortest du mit '<no classification>'"},
23
- {"role": "user", "content": prompt_text}])
24
- return response.choices[0].message.content
25
-
26
-
27
- # Define a function to ask a question to GPT-4
28
- def ask_gpt4(question):
29
- print(question) # we don't have to submit the question?
30
- try:
31
- # Use the chat function to send a message and get a response
32
- response = ChatOpenAI()
33
- # Extract the response text
34
- return response["choices"][0]["message"]["content"]
35
- except Exception as e:
36
- # Handle exceptions that may occur during the API call
37
- return str(e)
38
-
39
-
40
- def process_prompts_and_save(my_prompts):
41
- # Ensure the responses list is empty initially
42
- responses = []
43
-
44
- # Loop through each prompt in the list
45
- for prompt in my_prompts:
46
- try:
47
- # ADD LOGIC TO READ FILE AND CLASSIFY
48
- # Generate response for each prompt and append to the list
49
- response = ask_gpt4(prompt)
50
- sol = f"{prompt}\n\n{response}\n\n\n\n"
51
- print(sol)
52
- responses.append(sol)
53
- except Exception as e:
54
- # In case of an error, log the error with the prompt
55
- responses.append(f"{prompt}\n\nError:{str(e)}\n\n\n\n")
56
-
57
- # Writing all responses to a text file
58
- with open('gpt4_responses.txt', 'w', encoding='utf-8') as file:
59
- file.writelines(responses)
60
-
61
-
62
- def get_pdfs_text(pdf_docs):
63
- text = ""
64
- for pdf in pdf_docs:
65
- pdf_reader = PdfReader(pdf)
66
- for page in pdf_reader.pages:
67
- text += page.extract_text()
68
- return text
69
-
70
-
71
- def get_pdf_text(pdf_document):
72
- text = ""
73
- pdf_reader = PdfReader(pdf_document)
74
- for page in pdf_reader.pages:
75
- text += page.extract_text()
76
- return text
77
-
78
-
79
- def json_open(filename):
80
- with open(filename, "r") as f:
81
- mydata = f.read()
82
- return mydata
83
-
84
-
85
- def main():
86
- st.title("Doc Classifier")
87
- l, r = st.columns(2)
88
- if st.toggle("show README"):
89
- st.subheader("Funktion: ")
90
- st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente")
91
- st.write("des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren.")
92
- st.write("lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren.")
93
- st.write("Feedback und Bugs gerne an elia.waefler@insel.ch")
94
- st.write("Vielen Dank.")
95
- st.write("")
96
- with l:
97
- st.subheader("Limitationen: ")
98
- st.write("bisher nur PDFs")
99
- st.write("nur Disziplin, Doc typ. und Geschoss")
100
- st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
101
- st.write("")
102
- with r:
103
- st.subheader("geplante Erweiterungen:")
104
- st.write("Text Beschreibung wird von AI hinzugefügt")
105
- st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
106
- st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")
107
-
108
- if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
109
- uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)
110
- #print(uploaded_file)
111
- #print(uploaded_file.name)
112
-
113
- if st.button("classify KBOB!"):
114
- if uploaded_files is not None:
115
- with st.container():
116
- # col1, col2, col3, col4, col5 = st.columns(5)
117
- col1, col2, col3 = st.columns(3)
118
- all_metadata = []
119
- with col1:
120
- st.write("Disziplin")
121
- st.write(f"")
122
- with col2:
123
- st.write("Dokumententyp")
124
- st.write(f"")
125
- with col3:
126
- st.write("Geschoss")
127
- st.write(f"")
128
-
129
- for file in uploaded_files:
130
- metadata = [file.name]
131
- with col1:
132
- with st.spinner("GPT4 at work"):
133
- pdf_text = str(get_pdf_text(file))
134
- prompt_1 = auftrag_0 + auftrag_1_disziplin + str(Baubranchen_Disziplinen) + pdf_text
135
- answer_1 = gpt4_new(prompt_1)
136
- print(prompt_1)
137
- metadata.append(answer_1)
138
- st.write(answer_1)
139
-
140
- with col2:
141
- with st.spinner("GPT4 at work"):
142
- prompt_2 = auftrag_0 + auftrag_1_type + str(Dokumententypen) + pdf_text
143
- answer_2 = gpt4_new(prompt_2)
144
- print(prompt_2)
145
- metadata.append(answer_2)
146
-
147
- st.write(answer_2)
148
-
149
- with col3:
150
- with st.spinner("GPT4 at work"):
151
- prompt_3 = auftrag_0 + auftrag_1_ge + str(ASH_Geschosse) + pdf_text
152
- answer_3 = gpt4_new(prompt_3)
153
- print(prompt_3)
154
- metadata.append(answer_2)
155
-
156
- st.write(answer_3)
157
-
158
- all_metadata.append(metadata)
159
-
160
- metadata_filename = "ai_generated_metadata.txt"
161
- with open(metadata_filename, 'w', encoding='utf-8') as f:
162
- for line in all_metadata:
163
- f.writelines("\n")
164
- for item in line:
165
- f.writelines(item)
166
- f.writelines(";")
167
-
168
- f.writelines("\n")
169
-
170
- st.success("classified, saved")
171
- st.download_button(f"Download Metadata", json_open(metadata_filename), file_name=metadata_filename)
172
- else:
173
- st.warning("no file")
174
-
175
-
176
- if __name__ == "__main__":
177
- #prompts = ["classify the document, tell me the ", "hello"]
178
- #process_prompts_and_save(prompts)
179
- auftrag_0 = "Klassifiziere dieses Dokument nach "
180
- auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
181
- auftrag_1_type = "diesen 'Dokumententypen': "
182
- auftrag_1_ge = "diesen 'Geschossen': "
183
- Baubranchen_Disziplinen = ['A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
184
- 'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
185
- 'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
186
- 'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
187
- 'Z-Lichtplanung']
188
- auftrag_2 = "gib nur den am besten passendsten Eintrag zurück. " \
189
- "Keine weiteren Ausführungen oder Erklärungen. " \
190
- "Antworte am besten in einem Wort. " \
191
- "Hier der Dokumenteninhalt: "
192
- Dokumententypen = ['Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
193
- 'Prozessdokumentation', 'Fachdokumentation', 'Anlagedokumentation']
194
- ASH_Geschosse = ['U4', 'U3', 'U2', 'U1',
195
- 'A', 'B', 'C', 'D', 'E', 'F', 'G']
196
- #print(str(Baubranchen_Disziplinen))
197
- main()
 
1
+ import streamlit as st
2
+ import os
3
+ # import openai
4
+ from PyPDF2 import PdfReader
5
+ from openai import OpenAI
6
+ from langchain.chat_models import ChatOpenAI
7
+
8
+ ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
9
+
10
+
11
def gpt4_new(prompt_text):
    """Classify a document prompt with GPT-4 and return the model's answer.

    Args:
        prompt_text: The user message (instruction, category list and text).

    Returns:
        str: The content of the first completion choice ("" if it is empty).
    """
    # Adjacent literals are concatenated verbatim, so every sentence carries
    # its own trailing separator; without it the sentences run together.
    system_prompt = (
        "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten, "
        "das Dokument in vorgegebene Kategorien klassifiziert. "
        "Du gibts möglichst kurze Antworten, am besten ein Wort. "
        "Du gibst keine Erklärungen oder Begründungen. "
        "Du klassifizierst nur nach den vorgegebenen Kategorien. "
        "Wenn ein Dokument partout nicht klassifizierbar ist, "
        "antwortest du mit '<no classification>'"
    )
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt_text},
        ],
    )
    # Normalise a missing content to "" so callers can always treat it as text.
    return response.choices[0].message.content or ""
25
+
26
+
27
+ # Define a function to ask a question to GPT-4
28
def ask_gpt4(question):
    """Send *question* to the chat model and return the reply text.

    Returns:
        str: The model's answer, or the error message if the call failed.
    """
    print(question)
    try:
        # The question has to be submitted to the model; constructing the
        # client alone performs no request.
        reply = ChatOpenAI().invoke(question)
    except Exception as e:
        # Callers expect a string either way, so failures are reported as text.
        return str(e)
    return reply.content
38
+
39
+
40
def process_prompts_and_save(my_prompts):
    """Ask GPT-4 every prompt in *my_prompts* and save the transcript.

    Each prompt/response pair (or prompt/error pair) becomes one entry in
    'gpt4_responses.txt', which is overwritten on every call.
    """
    collected = []

    for current in my_prompts:
        try:
            answer = ask_gpt4(current)
            entry = f"{current}\n\n{answer}\n\n\n\n"
            print(entry)
        except Exception as err:
            # Record the failure next to the prompt that caused it.
            entry = f"{current}\n\nError:{str(err)}\n\n\n\n"
        collected.append(entry)

    with open('gpt4_responses.txt', 'w', encoding='utf-8') as file:
        file.writelines(collected)
60
+
61
+
62
def get_pdfs_text(pdf_docs):
    """Return the concatenated page text of all PDFs in *pdf_docs*.

    Args:
        pdf_docs: Iterable of paths or file-like objects accepted by PdfReader.

    Returns:
        str: All page texts joined without a separator; "" for no documents.
    """
    # ``or ""`` keeps pages without extractable text from breaking the join.
    return "".join(
        page.extract_text() or ""
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )
69
+
70
+
71
def get_pdf_text(pdf_document):
    """Return the concatenated text of every page of one PDF.

    Args:
        pdf_document: Path or file-like object accepted by PdfReader.

    Returns:
        str: The page texts joined without a separator.
    """
    # ``or ""`` keeps pages without extractable text from breaking the join.
    return "".join(page.extract_text() or "" for page in PdfReader(pdf_document).pages)
77
+
78
+
79
def json_open(filename):
    """Return the full text content of *filename*.

    The file is decoded as UTF-8, the encoding this module uses when it writes
    its metadata file, so non-ASCII characters survive on every platform.
    """
    with open(filename, "r", encoding="utf-8") as f:
        return f.read()
83
+
84
+
85
def main():
    """Render the document-classifier page.

    After the password check the user uploads PDFs; each file is classified by
    discipline, document type and storey, one result column per category. The
    answers are written to 'ai_generated_metadata.txt' and offered as download.
    """
    st.title("Doc Classifier")
    left_col, right_col = st.columns(2)
    if st.toggle("show README"):
        st.subheader("Funktion: ")
        st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente")
        st.write("des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren.")
        st.write("lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren.")
        st.write("Feedback und Bugs gerne an elia.waefler@insel.ch")
        st.write("Vielen Dank.")
        st.write("")
        with left_col:
            st.subheader("Limitationen: ")
            st.write("bisher nur PDFs")
            st.write("nur Disziplin, Doc typ. und Geschoss")
            st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
            st.write("")
        with right_col:
            st.subheader("geplante Erweiterungen:")
            st.write("Text Beschreibung wird von AI hinzugefügt")
            st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
            st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")

    if st.text_input("ASK_ASH_PASSWORD: ", type="password") != ASK_ASH_PASSWORD:
        return

    uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)
    if not st.button("classify KBOB!"):
        return

    # With accept_multiple_files=True the uploader value is a list, so an
    # emptiness check (not an ``is None`` check) detects "nothing uploaded".
    if not uploaded_files:
        st.warning("no file")
        return

    # One entry per result column: heading, prompt fragment, category list.
    tasks = (
        ("Disziplin", auftrag_1_disziplin, Baubranchen_Disziplinen),
        ("Dokumententyp", auftrag_1_type, Dokumententypen),
        ("Geschoss", auftrag_1_ge, ASH_Geschosse),
    )

    with st.container():
        columns = st.columns(3)
        for column, (heading, _, _) in zip(columns, tasks):
            with column:
                st.write(heading)
                st.write("")

        all_metadata = []
        for file in uploaded_files:
            pdf_text = str(get_pdf_text(file))
            metadata = [file.name]
            for column, (_, auftrag, categories) in zip(columns, tasks):
                with column:
                    with st.spinner("GPT4 at work"):
                        answer = gpt4_new(auftrag_0 + auftrag + str(categories) + pdf_text) or ""
                    # Each column records its own answer.
                    metadata.append(answer)
                    st.write(answer)
            all_metadata.append(metadata)

        # One line per file: a leading newline, then every item followed by ';'.
        metadata_text = "".join(
            "\n" + "".join(f"{item};" for item in record) for record in all_metadata
        ) + "\n"

        metadata_filename = "ai_generated_metadata.txt"
        with open(metadata_filename, 'w', encoding='utf-8') as f:
            f.write(metadata_text)

        st.success("classified, saved")
        st.download_button("Download Metadata", metadata_text, file_name=metadata_filename)
174
+
175
+
176
if __name__ == "__main__":
    # Prompt fragments; main() combines them into the classification prompts.
    auftrag_0 = "Klassifiziere dieses Dokument nach "
    auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
    auftrag_1_type = "diesen 'Dokumententypen': "
    auftrag_1_ge = "diesen 'Geschossen': "
    auftrag_2 = (
        "gib nur den am besten passendsten Eintrag zurück. "
        "Keine weiteren Ausführungen oder Erklärungen. "
        "Antworte am besten in einem Wort. "
        "Hier der Dokumenteninhalt: "
    )

    # Category lists offered to the model.
    Baubranchen_Disziplinen = [
        'A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz',
        'E-Elektroanlagen', 'F-Fassadenplanung', 'G-Küche', 'H-Heizung',
        'I-Innenausbau', 'K-Kälte', 'L-Lüftung', 'M-Medizintechnik',
        'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler', 'Q-Brandschutz',
        'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
        'Z-Lichtplanung',
    ]
    Dokumententypen = [
        'Fotodokumentation',
        'Projektdokumentation (PD)',
        'Objektdokumentation (OD)',
        'Prozessdokumentation',
        'Fachdokumentation',
        'Anlagedokumentation',
    ]
    ASH_Geschosse = ['U4', 'U3', 'U2', 'U1', 'A', 'B', 'C', 'D', 'E', 'F', 'G']

    main()
ingest.py → util/ingest.py RENAMED
@@ -1,126 +1,126 @@
1
- from PyPDF2 import PdfReader
2
- from langchain.text_splitter import CharacterTextSplitter
3
- import tabula
4
- import io
5
- import fitz # PyMuPDF
6
- import pdfplumber
7
- from PIL import Image
8
- import io
9
-
10
-
11
- def get_pdf_tables(pdf_bytes):
12
- """
13
- Extracts tables from a PDF file loaded directly from bytes.
14
-
15
- Args:
16
- pdf_bytes (bytes): The byte content of the PDF file.
17
-
18
- Returns:
19
- List[pandas.DataFrame]: A list of DataFrames, each representing a table extracted from the PDF.
20
- """
21
- tables = []
22
- with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
23
- for page in pdf.pages:
24
- # Extract tables from the current page
25
- page_tables = page.extract_tables()
26
- for table in page_tables:
27
- # Convert table to a DataFrame and append to the list
28
- tables.append(table)
29
-
30
- # Optionally convert lists of lists (tables) to pandas DataFrames
31
- import pandas as pd
32
- dataframes = [pd.DataFrame(table[1:], columns=table[0]) for table in tables if table]
33
- return dataframes
34
-
35
-
36
- def get_pdf_images(pdf_bytes):
37
- """
38
- Extracts images and captures screenshots of each page from a given PDF's bytes.
39
-
40
- Args:
41
- pdf_bytes (bytes): The byte content of the PDF file.
42
-
43
- Returns:
44
- List[bytes]: A list of image bytes extracted from the PDF, including screenshots of each page.
45
- """
46
- images = []
47
- pdf_stream = io.BytesIO(pdf_bytes)
48
- doc = fitz.open("pdf", pdf_stream.read())
49
-
50
- for page_num, page in enumerate(doc):
51
- # Take a screenshot of the current page
52
- pix = page.get_pixmap() # This line captures the page as an image
53
- img_bytes = pix.tobytes("png") # Save the pixmap as PNG bytes
54
- images.append(img_bytes) # Append the screenshot to the list of images
55
-
56
- # Extract embedded images
57
- for img_index, img in enumerate(page.get_images(full=True)):
58
- xref = img[0]
59
- base_image = doc.extract_image(xref)
60
- image_bytes = base_image["image"]
61
- images.append(image_bytes)
62
-
63
- doc.close()
64
- return images
65
-
66
-
67
- def get_pdf_old_tables(pdf_bytes):
68
- """
69
- Extracts tables from a given PDF's bytes using Tabula.
70
- Args:
71
- pdf_bytes (bytes): The byte content of the PDF file.
72
-
73
- Returns:
74
- List[pandas.DataFrame]: A list of DataFrames, each representing a table extracted from the PDF.
75
- """
76
- pdf_stream = io.BytesIO(pdf_bytes)
77
- # Read PDF into list of DataFrame
78
- tables = tabula.read_pdf(pdf_stream, pages='all', multiple_tables=True)
79
- return tables
80
-
81
-
82
- def get_pdf_text(pdf_docs):
83
- text = ""
84
- if type(pdf_docs) == list:
85
- for pdf in pdf_docs:
86
- pdf_reader = PdfReader(pdf)
87
- for page in pdf_reader.pages:
88
- text += page.extract_text()
89
- else:
90
- pdf_reader = PdfReader(pdf_docs)
91
- for page in pdf_reader.pages:
92
- text += page.extract_text()
93
- return text
94
-
95
-
96
- def get_text_chunks(text):
97
- text_splitter = CharacterTextSplitter(
98
- separator="\n",
99
- chunk_size=1000,
100
- chunk_overlap=200,
101
- length_function=len
102
- )
103
- chunks = text_splitter.split_text(text)
104
- return chunks
105
-
106
-
107
- def extract_images_from_pdf_path(pdf_path):
108
- doc = fitz.open(pdf_path)
109
- images = []
110
- for i in range(len(doc)):
111
- for img in doc.get_page_images(i):
112
- xref = img[0]
113
- base = img[1]
114
- img_data = doc.extract_image(xref)
115
- img_bytes = img_data['image']
116
-
117
- image = Image.open(io.BytesIO(img_bytes))
118
- images.append(image)
119
-
120
- return images
121
-
122
-
123
- def get_tables_from_pdf_path(pdf_path):
124
- # read_pdf will save the pdf table into Pandas Dataframe
125
- tables = tabula.read_pdf(pdf_path, pages='all')
126
- return tables
 
1
+ from PyPDF2 import PdfReader
2
+ from langchain.text_splitter import CharacterTextSplitter
3
+ import tabula
4
+ import io
5
+ import fitz # PyMuPDF
6
+ import pdfplumber
7
+ from PIL import Image
8
+ import io
9
+
10
+
11
def get_pdf_tables(pdf_bytes):
    """
    Extract every table of a PDF that is given as raw bytes.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[pandas.DataFrame]: One DataFrame per non-empty table; the first
        row of each table supplies the column labels.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        raw_tables = [rows for page in pdf.pages for rows in page.extract_tables()]

    import pandas as pd

    # Empty tables are dropped; the remaining ones use row 0 as header.
    return [pd.DataFrame(rows[1:], columns=rows[0]) for rows in raw_tables if rows]
34
+
35
+
36
def get_pdf_images(pdf_bytes):
    """
    Collect a PNG rendering of each page plus all embedded images of a PDF.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[bytes]: For every page, the page rendering (PNG bytes) followed by
        the raw bytes of the images embedded on that page.
    """
    images = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            images.append(page.get_pixmap().tobytes("png"))
            for image_info in page.get_images(full=True):
                # The first field of an image entry is its xref number.
                images.append(doc.extract_image(image_info[0])["image"])
    return images
65
+
66
+
67
def get_pdf_old_tables(pdf_bytes):
    """
    Extract the tables of a PDF, given as raw bytes, with Tabula.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        The result of ``tabula.read_pdf`` for all pages with
        ``multiple_tables=True``.
    """
    return tabula.read_pdf(io.BytesIO(pdf_bytes), pages='all', multiple_tables=True)
80
+
81
+
82
def get_pdf_text(pdf_docs):
    """Return the concatenated page text of one PDF or a sequence of PDFs.

    Args:
        pdf_docs: A single path/file-like object, or a list or tuple of them.

    Returns:
        str: All page texts joined without a separator; "" for no documents.
    """
    # isinstance also accepts tuples and list subclasses, unlike ``type(x) == list``.
    documents = pdf_docs if isinstance(pdf_docs, (list, tuple)) else [pdf_docs]
    # ``or ""`` keeps pages without extractable text from breaking the join.
    return "".join(
        page.extract_text() or ""
        for document in documents
        for page in PdfReader(document).pages
    )
94
+
95
+
96
def get_text_chunks(text):
    """Split *text* on newlines into chunks of up to 1000 characters.

    Consecutive chunks overlap by 200 characters; length is measured with len.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
105
+
106
+
107
def extract_images_from_pdf_path(pdf_path):
    """Return all images embedded in the PDF at *pdf_path* as PIL images.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        list: One ``PIL.Image`` per image reference, in page order.
    """
    images = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_index in range(len(doc)):
            for image_info in doc.get_page_images(page_index):
                # The first field of an image entry is its xref number.
                image_bytes = doc.extract_image(image_info[0])['image']
                images.append(Image.open(io.BytesIO(image_bytes)))
    return images
121
+
122
+
123
def get_tables_from_pdf_path(pdf_path):
    """Return the tables Tabula finds on all pages of the PDF at *pdf_path*."""
    return tabula.read_pdf(pdf_path, pages='all')
my_1_reader.py → util/my_1_reader.py RENAMED
@@ -1,201 +1,201 @@
1
- # MUSS AUFGERÄUMT WERDEN
2
-
3
- import json
4
- import os
5
- import subprocess
6
- import PyPDF2
7
- import csv
8
- import fitz # PyMuPDF
9
-
10
-
11
- def extract_text_from_pdf(pdf_path):
12
- """
13
- Extracts all text from a PDF file.
14
-
15
- :param pdf_path: Path to the PDF file.
16
- :return: Extracted text as a string.
17
- """
18
- # Open the PDF file
19
- doc = fitz.open(pdf_path)
20
-
21
- # Initialize an empty string to hold the text
22
- text = ''
23
-
24
- # Iterate through each page in the PDF
25
- for page_num in range(len(doc)):
26
- # Get a page
27
- page = doc.load_page(page_num)
28
-
29
- # Extract text from the page and add it to the result
30
- text += page.get_text()
31
-
32
- # Close the document
33
- doc.close()
34
-
35
- return text
36
-
37
-
38
- def read_pdfs_from_folder(folder_path):
39
- """
40
- Reads all PDF files in the specified folder using PdfReader and extracts their text.
41
-
42
- Parameters:
43
- - folder_path: The path to the folder containing PDF files.
44
-
45
- Returns:
46
- - A dictionary with file names as keys and their extracted text as values.
47
- """
48
- pdf_texts = {}
49
- for filename in os.listdir(folder_path):
50
- if filename.endswith('.pdf'):
51
- file_path = os.path.join(folder_path, filename)
52
- with open(file_path, 'rb') as pdf_file:
53
- pdf_reader = PyPDF2.PdfReader(pdf_file)
54
- text = ''
55
- for page in pdf_reader.pages:
56
- try:
57
- text += page.extract_text()
58
- except UnicodeDecodeError as e:
59
- print(e)
60
- for c in text:
61
- if c in ["ä", "Ä"]:
62
- text = text[:text.index(c)] + "ae" + text[text.index(c)+1:]
63
- if c in ["ö", "Ö"]:
64
- text = text[:text.index(c)] + "oe" + text[text.index(c)+1:]
65
- if c in ["ü", "Ü"]:
66
- text = text[:text.index(c)] + "ue" + text[text.index(c)+1:]
67
- if c in [",", ";", "\\", '"']:
68
- text = text[:text.index(c)] + "_" + text[text.index(c)+1:]
69
- if c in ["/n", "\n"]:
70
- text = text[:text.index(c)] + "<newline>" + text[text.index(c) + 1:]
71
- pdf_texts[filename] = text
72
- return pdf_texts
73
-
74
-
75
- def read_csv_lines_as_strings(filename):
76
- """
77
- Opens a CSV file and returns each line as a string in a list.
78
-
79
- Parameters:
80
- - filename: The path to the CSV file.
81
-
82
- Returns:
83
- - A list of strings, each representing a line from the CSV file.
84
- """
85
- lines_as_strings = []
86
- with open(filename, newline='') as csvfile:
87
- try:
88
- reader = csv.reader(csvfile)
89
- for row in reader:
90
- # Convert the row (a list of values) back into a comma-separated string
91
- line_as_string = ','.join(row)
92
- lines_as_strings.append(line_as_string)
93
- except UnicodeDecodeError as e:
94
- print(e)
95
- return lines_as_strings
96
-
97
-
98
- # Function to load data from JSON files
99
- def load_data(filename):
100
- with open(filename, 'r') as file:
101
- try:
102
- return json.load(file)
103
- except UnicodeDecodeError as err:
104
- print(err)
105
- return {}
106
-
107
-
108
- def find_and_open_file(filename, start_directory):
109
- """
110
- Attempts to open a file with the given filename starting from the specified directory.
111
- If the file is not found, searches recursively in all subfolders. Works across macOS, Linux, and Windows.
112
- """
113
- for root, dirs, files in os.walk(start_directory):
114
- if filename in files:
115
- filepath = os.path.join(root, filename)
116
- print(f"File found: {filepath}")
117
- return filepath
118
- print(f"File {filename} not found.")
119
- return None
120
-
121
-
122
- def open_file(filepath):
123
- """
124
- Opens the file with the default application, based on the operating system.
125
- """
126
- if os.path.exists(filepath):
127
- if os.name == 'posix': # Linux, macOS, etc.
128
- subprocess.call(('open', filepath))
129
- elif os.name == 'nt': # Windows
130
- os.startfile(filepath)
131
- else:
132
- print(f"Cannot open file on this operating system: {filepath}")
133
- else:
134
- print(f"File does not exist: {filepath}")
135
-
136
-
137
- def list_folders_files_recursive(path, depth=0):
138
- """
139
- Recursively lists all folders and files within the specified path, including subfolders.
140
-
141
- Parameters:
142
- - path: The directory path to list contents from.
143
- - depth: The current depth of recursion (used for indentation in print statements).
144
-
145
- Returns:
146
- - None
147
- """
148
- # Ensure the provided path is a directory
149
- if not os.path.isdir(path):
150
- print(f"The provided path '{path}' is not a valid directory.")
151
- return
152
-
153
- indent = ' ' * depth # Indentation based on recursion depth
154
- folders, files = [], []
155
-
156
- # List all entries in the directory
157
- for entry in os.listdir(path):
158
- full_path = os.path.join(path, entry)
159
- if os.path.isdir(full_path):
160
- folders.append(entry)
161
- print(f"{indent}Folder: {entry}")
162
- # Recursively list subfolders and files
163
- list_folders_files_recursive(full_path, depth + 1)
164
- elif os.path.isfile(full_path):
165
- files.append(entry)
166
-
167
- for f in files:
168
- print(f"{indent}File: {f}")
169
-
170
-
171
- def list_folders_files(path):
172
- """
173
- Lists all folders and files within the specified path.
174
-
175
- Parameters:
176
- - path: The directory path to list contents from.
177
-
178
- Returns:
179
- - A tuple of two lists: (folders, files).
180
- """
181
- folders = []
182
- files = []
183
-
184
- # Ensure the provided path is a directory
185
- if not os.path.isdir(path):
186
- print(f"The provided path '{path}' is not a valid directory.")
187
- return folders, files
188
-
189
- # List all entries in the directory
190
- for entry in os.listdir(path):
191
- full_path = os.path.join(path, entry)
192
- if os.path.isdir(full_path):
193
- folders.append(entry)
194
- elif os.path.isfile(full_path):
195
- files.append(entry)
196
-
197
- return folders, files
198
-
199
-
200
- if __name__ == "__main__":
201
- print("here are all functions that read files")
 
1
+ # TODO: this module needs to be cleaned up
2
+
3
+ import json
4
+ import os
5
+ import subprocess
6
+ import PyPDF2
7
+ import csv
8
+ import fitz # PyMuPDF
9
+
10
+
11
def extract_text_from_pdf(pdf_path):
    """
    Extracts all text from a PDF file.

    :param pdf_path: Path to the PDF file.
    :return: Extracted text of all pages, concatenated in page order.
    """
    doc = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string page by page.
        return ''.join(
            doc.load_page(page_num).get_text() for page_num in range(len(doc))
        )
    finally:
        # Close the document even if a page fails to load or extract.
        doc.close()
36
+
37
+
38
# Single-character substitutions applied to extracted PDF text.
# NOTE(review): upper-case umlauts are lowered ("Ä" -> "ae"), exactly as the
# original replacement loop did; confirm whether "Ae" was intended.
_PDF_TEXT_REPLACEMENTS = str.maketrans({
    "ä": "ae", "Ä": "ae",
    "ö": "oe", "Ö": "oe",
    "ü": "ue", "Ü": "ue",
    ",": "_", ";": "_", "\\": "_", '"': "_",
    "\n": "<newline>",
})


def _sanitize_pdf_text(text):
    """Apply all character substitutions to ``text`` in a single pass."""
    return text.translate(_PDF_TEXT_REPLACEMENTS)


def read_pdfs_from_folder(folder_path):
    """
    Reads all PDF files in the specified folder using PdfReader and extracts their text.

    The extracted text is sanitised: umlauts are transliterated, the
    characters , ; \\ " become "_" and newlines become "<newline>".

    Parameters:
    - folder_path: The path to the folder containing PDF files.

    Returns:
    - A dictionary with file names as keys and their sanitised text as values.
    """
    pdf_texts = {}
    for filename in os.listdir(folder_path):
        if not filename.endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            parts = []
            for page in pdf_reader.pages:
                try:
                    # "or ''" guards against a page that yields no text.
                    parts.append(page.extract_text() or '')
                except UnicodeDecodeError as e:
                    # Best effort: report the page and keep the rest.
                    print(e)
        # One translate() pass replaces the original per-character loop,
        # which rebuilt the whole string for every matching character.
        pdf_texts[filename] = _sanitize_pdf_text(''.join(parts))
    return pdf_texts
73
+
74
+
75
def read_csv_lines_as_strings(filename):
    """
    Parse a CSV file and return every record re-joined with commas.

    Quoting is not restored: a field that itself contains a comma is merged
    into the joined string unquoted.

    Parameters:
    - filename: The path to the CSV file.

    Returns:
    - A list of strings, one per CSV record. If decoding fails part-way,
      the error is printed and the records read so far are returned.
    """
    joined_rows = []
    try:
        with open(filename, newline='') as handle:
            for fields in csv.reader(handle):
                joined_rows.append(','.join(fields))
    except UnicodeDecodeError as e:
        print(e)
    return joined_rows
96
+
97
+
98
+ # Function to load data from JSON files
99
def load_data(filename):
    """
    Load and return the JSON content of ``filename``.

    If the file cannot be decoded (UnicodeDecodeError), the error is printed
    and an empty dict is returned instead.
    """
    try:
        with open(filename, 'r') as handle:
            parsed = json.load(handle)
    except UnicodeDecodeError as err:
        print(err)
        return {}
    return parsed
106
+
107
+
108
def find_and_open_file(filename, start_directory):
    """
    Locate ``filename`` in ``start_directory`` or any of its subfolders.

    Despite its name, this function only searches; it does not open the
    file. The first match in ``os.walk`` order wins.

    Returns the full path of the match, or None when nothing was found.
    A message is printed in both cases.
    """
    match = next(
        (
            os.path.join(folder, filename)
            for folder, _subdirs, names in os.walk(start_directory)
            if filename in names
        ),
        None,
    )
    if match is None:
        print(f"File {filename} not found.")
        return None
    print(f"File found: {match}")
    return match
120
+
121
+
122
def open_file(filepath):
    """
    Opens the file with the default application, based on the operating system.

    Windows uses ``os.startfile``, macOS the ``open`` command and every
    other POSIX system ``xdg-open``. Nothing is opened when the path does
    not exist; a message is printed instead.

    Returns None.
    """
    import sys  # local import: only needed to tell macOS from other POSIX systems

    if not os.path.exists(filepath):
        print(f"File does not exist: {filepath}")
        return

    if os.name == 'nt':  # Windows
        os.startfile(filepath)
    elif sys.platform == 'darwin':  # macOS
        subprocess.call(('open', filepath))
    elif os.name == 'posix':  # Linux and other Unix-likes
        # "open" is the macOS launcher; the freedesktop equivalent is xdg-open.
        subprocess.call(('xdg-open', filepath))
    else:
        print(f"Cannot open file on this operating system: {filepath}")
135
+
136
+
137
def list_folders_files_recursive(path, depth=0):
    """
    Print the folder/file tree below ``path``.

    Within one directory each sub-folder is printed (and descended into) as
    soon as it is encountered; the directory's own files are printed after
    all of its entries have been visited.

    Parameters:
    - path: The directory path to list contents from.
    - depth: The current recursion depth; it controls the indentation prefix.

    Returns:
    - None
    """
    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return

    prefix = ' ' * depth
    pending_files = []

    for name in os.listdir(path):
        child = os.path.join(path, name)
        if os.path.isdir(child):
            print(f"{prefix}Folder: {name}")
            list_folders_files_recursive(child, depth + 1)
            continue
        if os.path.isfile(child):
            pending_files.append(name)

    for name in pending_files:
        print(f"{prefix}File: {name}")
169
+
170
+
171
def list_folders_files(path):
    """
    Collect the names of the direct sub-folders and files of ``path``.

    Parameters:
    - path: The directory path to list contents from.

    Returns:
    - A tuple of two lists: (folders, files). Both are empty (and a message
      is printed) when ``path`` is not a directory.
    """
    folders, files = [], []

    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return folders, files

    for name in os.listdir(path):
        target = os.path.join(path, name)
        # Entries that are neither a directory nor a regular file are skipped.
        if os.path.isdir(target):
            bucket = folders
        elif os.path.isfile(target):
            bucket = files
        else:
            continue
        bucket.append(name)

    return folders, files
198
+
199
+
200
# Running this module directly only prints a short description; the module
# is meant to be imported for its file-reading helpers.
if __name__ == "__main__":
    print("here are all functions that read files")
my_1_writer.py → util/my_1_writer.py RENAMED
File without changes
my_2_sim_search.py → util/my_2_sim_search.py RENAMED
@@ -1,163 +1,163 @@
1
- import my_new_openai
2
- import my_1_writer
3
- import json
4
- import numpy as np
5
-
6
-
7
- # sim search with dot_product and lin_distance
8
- # the newly vectorized TERM will be added to the database
9
- # database = .json file
10
- def sim_search_load_db(database, term, add_to_db=True, debug=False):
11
- if type(term) == str:
12
- print("str")
13
- vector1 = my_new_openai.vectorize_data(term)
14
- elif type(term) == list:
15
- print("list")
16
- vector1 = term
17
- else:
18
- print("invalid search_term/search_vector format")
19
- return
20
- with open(database, "r") as f:
21
- table = json.load(f)
22
- sim_search_dict = {}
23
- for key in table.keys():
24
- vector2 = table[key]
25
- if debug:
26
- print("")
27
- print(f"{vector1}")
28
- print(f"{vector2}")
29
- print(f"doing dot product for {key} and {term}")
30
- dp = np.dot(vector1, vector2)
31
- distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
32
- if debug:
33
- print(f"the dp is {dp}")
34
- print(f"the distance is{distance}")
35
- print("")
36
- print("")
37
- print("")
38
- sim_search_dict[key] = dp * distance
39
-
40
- # sort with the biggest similarity
41
- sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1]), reversed=True)
42
-
43
- if debug:
44
- for key, value in sorted_table[:5]:
45
- print(f"{key}: {value}")
46
- if add_to_db:
47
-
48
- if term in table.keys():
49
- print("the search term is in the database!")
50
- # add the newly vectorized term to the words, if not already in the vector table
51
- else:
52
- if database != "session/my_words_vec_table.json":
53
- database = "session/my_vecs.json"
54
- # table = load_df(database) # ??
55
- table[str(term)] = vector1
56
- my_1_writer.safe_my_dict_as_json(database, table)
57
- # first_key, first_value = list(sortedTable.items())[0]
58
- print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
59
- return sorted_table
60
-
61
-
62
- def dot_p_to_1(database, vector1=0, analysis_filename=0):
63
-
64
- with open(database, "r") as f:
65
- table = json.load(f)
66
- dot_product_to1 = {}
67
-
68
- if vector1 == 0:
69
- vector1 = [0.025515518153991442 for _ in range(1536)]
70
- elif vector1 == 1:
71
- vector1 = table[str(list(table.keys())[0])]
72
-
73
- for key in table.keys():
74
- dot_product_to1[key] = np.dot(vector1, table[key])
75
- my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
76
- print("dot p to 1 saved")
77
-
78
-
79
- def lin_dist(database, vector1=0, analysis_filename=0):
80
- with open(database, "r") as f:
81
- table = json.load(f)
82
- lin_dist_to_1 = {}
83
-
84
- if vector1 == 0:
85
- vector1 = [0.025515518153991442 for _ in range(1536)]
86
- elif vector1 == 1:
87
- vector1 = table[str(list(table.keys())[0])]
88
-
89
- for key in table.keys():
90
- lin_dist_to_1[key] = np.linalg.norm(np.array(vector1) - np.array(table[key]))
91
-
92
- my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
93
- print("lin dist to 1 saved")
94
-
95
-
96
- def manhattan_dist(database, vector1=0, analysis_filename=0):
97
- with open(database, "r") as f:
98
- table = json.load(f)
99
- manhattan_dist_to_1 = {}
100
-
101
- if vector1 == 0:
102
- vector1 = [0.025515518153991442 for _ in range(1536)]
103
- elif vector1 == 1:
104
- vector1 = table[str(list(table.keys())[0])]
105
-
106
- for key in table.keys():
107
- manhattan_dist_to_1[key] = sum(np.array(vector1) - np.array(table[key]))
108
-
109
- my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
110
- print("manhattan dist to 1 saved")
111
-
112
-
113
- #vec_table
114
- def sim_search_fly(vec_table, term, debug=False):
115
- if debug:
116
- print(type(vec_table))
117
- print(type(term))
118
- print(type(vec_table[list(vec_table.keys())[0]]))
119
- print("vec table:")
120
- print(vec_table[list(vec_table.keys())[5]][:4])
121
- print("search term")
122
- print(term[:4])
123
- if type(term) == str:
124
- print("str")
125
- vector1 = my_new_openai.vectorize_data(term)
126
- elif type(term) == list:
127
- print("list")
128
- vector1 = term
129
- else:
130
- print("invalid search_term/search_vector format")
131
- return
132
-
133
- sim_search_dict = {}
134
- for key in vec_table.keys():
135
- vector2 = vec_table[key]
136
- if debug:
137
- print("")
138
- print(f"{vector1}")
139
- print(f"{vector2}")
140
- print(f"doing dot product for {key} and {term}")
141
- if vector2[0] == vector2[1] and vector2[3] == vector2[4] and vector2[5] == vector2[6]:
142
- dp = 200
143
- else:
144
- dp = np.dot(vector1, vector2)
145
- #distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
146
- if debug:
147
- print(f"the dp is {dp}")
148
- #print(f"the distance is{distance}")
149
- print("")
150
- print("")
151
- print("")
152
- sim_search_dict[key] = dp #* distance
153
-
154
- # sort with the biggest similarity
155
- sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1]), reversed=True)
156
-
157
- if debug:
158
- for key, value in sorted_table[:5]:
159
- print(f"{key}: {value}")
160
-
161
- # first_key, first_value = list(sortedTable.items())[0]
162
- print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
163
- return sorted_table
 
1
+ import my_new_openai
2
+ import my_1_writer
3
+ import json
4
+ import numpy as np
5
+
6
+
7
+ # sim search with dot_product and lin_distance
8
+ # the newly vectorized TERM will be added to the database
9
+ # database = .json file
10
def sim_search_load_db(database, term, add_to_db=True, debug=False):
    """
    Score every vector of a JSON vector table against a search term.

    Parameters:
    - database: Path of a JSON file mapping keys to vectors.
    - term: Either a string (vectorized with my_new_openai.vectorize_data)
      or a list that is used directly as the search vector.
    - add_to_db: When True and the term is not yet a key of the table, the
      term's vector is added and the table is written back with
      my_1_writer.safe_my_dict_as_json.
    - debug: Print the vectors and intermediate scores.

    Returns:
    - A dict mapping each table key to its score, ordered from the highest
      score to the lowest; None when ``term`` is neither str nor list.
    """
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    with open(database, "r") as f:
        table = json.load(f)

    query = np.array(vector1)
    sim_search_dict = {}
    for key, vector2 in table.items():
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        dp = np.dot(vector1, vector2)
        distance = np.linalg.norm(query - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            print(f"the distance is{distance}")
            print("")
            print("")
            print("")
        # NOTE(review): a larger dot product means "more similar" while a
        # larger distance means "less similar"; multiplying them is kept
        # from the original but looks questionable - confirm the metric.
        sim_search_dict[key] = dp * distance

    # Highest score first. The original passed "reversed=True" to dict(),
    # which left the order ascending and inserted a bogus "reversed" entry.
    sorted_table = dict(
        sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True)
    )

    if debug:
        # A dict cannot be sliced; take the first five items explicitly.
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    if add_to_db:
        # Keys are strings, so compare the string form (a list term is
        # unhashable and used to raise a TypeError here).
        term_key = str(term)
        if term_key in table:
            print("the search term is in the database!")
        else:
            # add the newly vectorized term to the words, if not already in the vector table
            if database != "session/my_words_vec_table.json":
                database = "session/my_vecs.json"
            table[term_key] = vector1
            my_1_writer.safe_my_dict_as_json(database, table)
    if sorted_table:
        # Guarded so an empty table no longer raises an IndexError.
        print(f"the closest word to your input is: {next(iter(sorted_table))}")
    return sorted_table
60
+
61
+
62
def dot_p_to_1(database, vector1=0, analysis_filename=0):
    """
    Compute the dot product of every table vector with a reference vector
    and save the results with my_1_writer.safe_my_dict_as_json.

    Parameters:
    - database: Path of a JSON file mapping keys to vectors.
    - vector1: The reference vector. 0 selects a 1536-component vector whose
      components are all 0.0255... (about 1/sqrt(1536)); 1 selects the first
      vector of the table.
    - analysis_filename: Target passed to the writer for the result dict.
    """
    with open(database, "r") as handle:
        vec_table = json.load(handle)

    if vector1 == 0:
        vector1 = [0.025515518153991442] * 1536
    elif vector1 == 1:
        # JSON keys are already strings, so no str() conversion is needed.
        vector1 = vec_table[list(vec_table)[0]]

    results = {name: np.dot(vector1, vec) for name, vec in vec_table.items()}
    my_1_writer.safe_my_dict_as_json(analysis_filename, results)
    print("dot p to 1 saved")
77
+
78
+
79
def lin_dist(database, vector1=0, analysis_filename=0):
    """
    Compute the Euclidean distance of every table vector to a reference
    vector and save the results with my_1_writer.safe_my_dict_as_json.

    Parameters:
    - database: Path of a JSON file mapping keys to vectors.
    - vector1: The reference vector. 0 selects a 1536-component vector whose
      components are all 0.0255... (about 1/sqrt(1536)); 1 selects the first
      vector of the table.
    - analysis_filename: Target passed to the writer for the result dict.
    """
    with open(database, "r") as handle:
        vec_table = json.load(handle)

    if vector1 == 0:
        vector1 = [0.025515518153991442] * 1536
    elif vector1 == 1:
        # JSON keys are already strings, so no str() conversion is needed.
        vector1 = vec_table[list(vec_table)[0]]

    results = {
        name: np.linalg.norm(np.array(vector1) - np.array(vec))
        for name, vec in vec_table.items()
    }
    my_1_writer.safe_my_dict_as_json(analysis_filename, results)
    print("lin dist to 1 saved")
94
+
95
+
96
def manhattan_dist(database, vector1=0, analysis_filename=0):
    """
    Compute the Manhattan (L1) distance of every table vector to a reference
    vector and save the results with my_1_writer.safe_my_dict_as_json.

    Parameters:
    - database: Path of a JSON file mapping keys to vectors.
    - vector1: The reference vector. 0 selects a 1536-component vector whose
      components are all 0.0255... (about 1/sqrt(1536)); 1 selects the first
      vector of the table. Any other value is used as the vector itself.
    - analysis_filename: Target passed to the writer for the result dict.
    """
    with open(database, "r") as f:
        table = json.load(f)

    # Only compare scalars with the sentinels; comparing a numpy array with
    # "==" would produce an array whose truth value is ambiguous.
    if np.isscalar(vector1):
        if vector1 == 0:
            vector1 = [0.025515518153991442 for _ in range(1536)]
        elif vector1 == 1:
            vector1 = table[list(table)[0]]

    reference = np.array(vector1)
    # The L1 distance sums ABSOLUTE differences; the original summed signed
    # differences, which cancel each other out.
    manhattan_dist_to_1 = {
        key: float(np.abs(reference - np.array(vec)).sum())
        for key, vec in table.items()
    }

    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
    print("manhattan dist to 1 saved")
111
+
112
+
113
+ #vec_table
114
def sim_search_fly(vec_table, term, debug=False):
    """
    Score every vector of an in-memory vector table against a search term.

    Parameters:
    - vec_table: Dict mapping keys to vectors.
    - term: Either a string (vectorized with my_new_openai.vectorize_data)
      or a list that is used directly as the search vector.
    - debug: Print types, vectors and intermediate scores.

    Returns:
    - A dict mapping each key to its dot product with the search vector,
      ordered from the highest score to the lowest. Vectors matching the
      repeated-component pattern below keep the fixed score 200 and are
      placed after all regular entries. None is returned when ``term`` is
      neither str nor list.
    """
    if debug:
        print(type(vec_table))
        print(type(term))
        keys = list(vec_table.keys())
        if keys:
            print(type(vec_table[keys[0]]))
            print("vec table:")
            # The original always peeked at the 6th entry; fall back to the
            # last one so small tables do not raise an IndexError.
            print(vec_table[keys[min(5, len(keys) - 1)]][:4])
        if isinstance(term, (str, list)):
            print("search term")
            print(term[:4])
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return

    sim_search_dict = {}
    placeholder_keys = set()
    for key, vector2 in vec_table.items():
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        # Presumably identifies placeholder vectors with repeated components
        # - TODO confirm against the code that builds the table.
        is_placeholder = (
            len(vector2) > 6
            and vector2[0] == vector2[1]
            and vector2[3] == vector2[4]
            and vector2[5] == vector2[6]
        )
        if is_placeholder:
            dp = 200
            placeholder_keys.add(key)
        else:
            dp = np.dot(vector1, vector2)
        if debug:
            print(f"the dp is {dp}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp

    # Highest similarity first. The original passed "reversed=True" to
    # dict(), which left the order ascending and inserted a bogus "reversed"
    # entry. Placeholders are ranked last so their fixed score of 200 cannot
    # outrank real matches.
    ranked = sorted(
        sim_search_dict.items(),
        key=lambda item: (item[0] in placeholder_keys, -item[1]),
    )
    sorted_table = dict(ranked)

    if debug:
        # A dict cannot be sliced; use the ranked list instead.
        for key, value in ranked[:5]:
            print(f"{key}: {value}")

    if ranked:
        print(f"the closest word to your input is: {ranked[0][0]}")
    return sorted_table
my_new_openai.py → util/my_new_openai.py RENAMED
@@ -1,151 +1,151 @@
1
- import os
2
- from openai import OpenAI
3
- import requests
4
- import base64
5
-
6
- client = OpenAI()
7
-
8
-
9
- def image_bytes_to_base64(image_bytes):
10
- """
11
- Converts an image from bytes to a Base64 encoded string.
12
-
13
- Args:
14
- image_bytes (bytes): Byte content of the image.
15
-
16
- Returns:
17
- str: A Base64 encoded string of the image.
18
- """
19
- return base64.b64encode(image_bytes).decode('utf-8')
20
-
21
-
22
- def image_to_base64(image_path):
23
- with open(image_path, "rb") as image_file:
24
- return str(base64.b64encode(image_file.read()).decode('utf-8'))
25
-
26
-
27
- def gpt4_new(prompt_text):
28
- gpt_response = client.chat.completions.create(
29
- model="gpt-4",
30
- messages=[{"role": "system",
31
- "content": "Du bist eine Maschine, die Dokumente klassifiziert."},
32
- {"role": "user", "content": prompt_text}])
33
- return gpt_response.choices[0].message.content
34
-
35
-
36
- def vectorize_data(data_input):
37
- # input can be list or string:
38
-
39
- if isinstance(data_input, list):
40
- # returning a dictionary
41
- my_dict = {}
42
- for item in data_input:
43
- my_dict[str(item)] = client.embeddings.create(input=data_input,
44
- model="text-embedding-ada-002").data[0].embedding
45
- return my_dict
46
-
47
- elif isinstance(data_input, str):
48
- # returning just the vector
49
- return client.embeddings.create(input=data_input, model="text-embedding-ada-002").data[0].embedding
50
-
51
- else:
52
- print("none")
53
-
54
-
55
- def img_create(prompt="a nice house on the beach", download_path=""):
56
- # to open, must download
57
- my_url = client.images.generate(model="dall-e-3", prompt=prompt, size="1024x1024").data[0].url
58
- if download_path:
59
- my_image = requests.get(my_url)
60
- if my_image.status_code == 200:
61
- with open(download_path, 'wb') as f:
62
- f.write(my_image.content)
63
- else:
64
- print("Failed to retrieve image")
65
- return my_url
66
-
67
-
68
- def img_to_text(img_url="", img_base64="", prompt="What’s in this image?", print_out=True):
69
- if img_url:
70
- img_desc_response = client.chat.completions.create(
71
- model="gpt-4-turbo",
72
- messages=[
73
- {
74
- "role": "user",
75
- "content": [
76
- {"type": "text", "text": prompt},
77
- {
78
- "type": "image_url",
79
- "image_url": {
80
- "url": img_url,
81
- },
82
- },
83
- ],
84
- }
85
- ],
86
- max_tokens=500,
87
- )
88
- if print_out:
89
- print(img_desc_response.choices[0].message.content)
90
- return img_desc_response.choices[0].message.content
91
- elif img_base64:
92
- headers = {
93
- "Content-Type": "application/json",
94
- "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
95
- }
96
- payload = {
97
- "model": "gpt-4-turbo",
98
- "messages": [
99
- {
100
- "role": "user",
101
- "content": [
102
- {
103
- "type": "text",
104
- "text": prompt
105
- },
106
- {
107
- "type": "image_url",
108
- "image_url": {
109
- "url": f"data:image/jpeg;base64,{img_base64}"
110
- }
111
- }
112
- ]
113
- }
114
- ],
115
- "max_tokens": 300
116
- }
117
- img_desc_response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
118
- if print_out:
119
- print(img_desc_response.json()["choices"][0]["message"]["content"])
120
- return img_desc_response.json()["choices"][0]["message"]["content"]
121
- else:
122
- return ValueError
123
-
124
-
125
- def encode_image_to_base64(image_path):
126
- with open(image_path, "rb") as image_file:
127
- encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
128
- return encoded_string
129
-
130
-
131
- def table_to_text(table=None, prompt="describe this table in plain text. "
132
- "be as precise as possible. spare no detail. "
133
- "what is in this table?", print_out=True):
134
- if table is not None:
135
- response = gpt4_new(f"{prompt} TABLE: {table}")
136
- if print_out:
137
- print(response)
138
- return response
139
- else:
140
- return ValueError
141
-
142
-
143
- if __name__ == "__main__":
144
- #print("here are all functions that directly call openai.")
145
- #img_create("a skier in the swiss alps", download_path="skier.png")
146
- #img_to_text(img_base64=encode_image_to_base64("skier.png"))
147
- #print(image_to_base64("skier.png"))
148
- #print(vectorize_data("test string"))
149
-
150
- print(gpt4_new())
151
-
 
1
+ import os
2
+ from openai import OpenAI
3
+ import requests
4
+ import base64
5
+
6
+ client = OpenAI()
7
+
8
+
9
def image_bytes_to_base64(image_bytes):
    """
    Encode raw image bytes as a Base64 text string.

    Args:
        image_bytes (bytes): Byte content of the image.

    Returns:
        str: The Base64 representation of ``image_bytes``.
    """
    encoded = base64.b64encode(image_bytes)
    return str(encoded, 'utf-8')
20
+
21
+
22
def image_to_base64(image_path):
    """Read the file at ``image_path`` and return its content as a Base64 string."""
    with open(image_path, "rb") as image_file:
        raw = image_file.read()
    return base64.b64encode(raw).decode('utf-8')
25
+
26
+
27
def gpt4_new(prompt_text):
    """
    Send ``prompt_text`` as the user message to the "gpt-4" chat model.

    The German system message tells the model it is a machine that
    classifies documents.

    Returns:
        The content of the first choice of the completion.
    """
    conversation = [
        {"role": "system",
         "content": "Du bist eine Maschine, die Dokumente klassifiziert."},
        {"role": "user", "content": prompt_text},
    ]
    completion = client.chat.completions.create(model="gpt-4", messages=conversation)
    return completion.choices[0].message.content
34
+
35
+
36
def vectorize_data(data_input):
    """
    Embed a string or a list of items with "text-embedding-ada-002".

    Args:
        data_input: A string, or a list whose items are embedded one by one
            using their ``str()`` form.

    Returns:
        - for a list: a dict mapping ``str(item)`` to that item's embedding
        - for a string: the embedding vector itself
        - for anything else: None (after printing "none")
    """
    model = "text-embedding-ada-002"

    if isinstance(data_input, list):
        # The original sent the WHOLE list on every iteration and read
        # data[0], so every key received the first item's embedding.
        return {
            str(item): client.embeddings.create(input=str(item), model=model).data[0].embedding
            for item in data_input
        }

    elif isinstance(data_input, str):
        return client.embeddings.create(input=data_input, model=model).data[0].embedding

    else:
        print("none")
53
+
54
+
55
def img_create(prompt="a nice house on the beach", download_path=""):
    """
    Generate a 1024x1024 image with "dall-e-3" and return its URL.

    When ``download_path`` is given, the image is also downloaded and
    written to that path; a message is printed if the download does not
    answer with HTTP 200.

    Returns:
        The URL of the generated image.
    """
    generation = client.images.generate(model="dall-e-3", prompt=prompt, size="1024x1024")
    my_url = generation.data[0].url

    if not download_path:
        return my_url

    my_image = requests.get(my_url)
    if my_image.status_code != 200:
        print("Failed to retrieve image")
        return my_url

    with open(download_path, 'wb') as target:
        target.write(my_image.content)
    return my_url
66
+
67
+
68
def img_to_text(img_url="", img_base64="", prompt="What’s in this image?", print_out=True):
    """
    Ask "gpt-4-turbo" to describe an image.

    Args:
        img_url: URL of the image. Takes precedence over ``img_base64``.
        img_base64: Base64 encoded image data; it is sent as a
            ``data:image/jpeg;base64,...`` URL.
        prompt: The question sent along with the image.
        print_out: Print the answer in addition to returning it.

    Returns:
        str: The content of the first choice of the completion.

    Raises:
        ValueError: If neither ``img_url`` nor ``img_base64`` is given.
            (The original returned the ValueError class instead of raising.)
    """
    if img_url:
        image_ref = img_url
        max_tokens = 500
    elif img_base64:
        image_ref = f"data:image/jpeg;base64,{img_base64}"
        # The original base64 path used a lower token limit; kept as is.
        max_tokens = 300
    else:
        raise ValueError("img_to_text requires either img_url or img_base64")

    # Both variants go through the SDK client, so HTTP errors surface as
    # SDK exceptions instead of a KeyError on a failed raw response.
    img_desc_response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": image_ref}},
                ],
            }
        ],
        max_tokens=max_tokens,
    )
    description = img_desc_response.choices[0].message.content
    if print_out:
        print(description)
    return description
123
+
124
+
125
def encode_image_to_base64(image_path):
    """Return the content of the file at ``image_path`` as a Base64 string."""
    with open(image_path, "rb") as image_file:
        return str(base64.b64encode(image_file.read()), 'utf-8')
129
+
130
+
131
def table_to_text(table=None, prompt="describe this table in plain text. "
                                     "be as precise as possible. spare no detail. "
                                     "what is in this table?", print_out=True):
    """
    Let the chat model describe a table in plain text.

    Args:
        table: The table to describe; it is inserted into the prompt via
            its string form.
        prompt: Instruction placed in front of the table.
        print_out: Print the answer in addition to returning it.

    Returns:
        The answer of ``gpt4_new`` for the combined prompt.

    Raises:
        ValueError: If ``table`` is None. (The original returned the
            ValueError class instead of raising it.)
    """
    if table is None:
        raise ValueError("table_to_text requires a table")
    response = gpt4_new(f"{prompt} TABLE: {table}")
    if print_out:
        print(response)
    return response
141
+
142
+
143
if __name__ == "__main__":
    # Manual smoke tests for the helpers in this module; enable as needed.
    #print("here are all functions that directly call openai.")
    #img_create("a skier in the swiss alps", download_path="skier.png")
    #img_to_text(img_base64=encode_image_to_base64("skier.png"))
    #print(image_to_base64("skier.png"))
    #print(vectorize_data("test string"))

    # gpt4_new() requires a prompt; calling it without one raised a TypeError.
    print(gpt4_new("test string"))
151
+
my_vectors.py → util/my_vectors.py RENAMED
File without changes
setup_db.py → util/setup_db.py RENAMED
@@ -1,50 +1,50 @@
1
- import time
2
- import openpyxl
3
- import my_new_openai
4
-
5
-
6
- def update_excel_with_sums(filename):
7
- # Load the workbook and select the active worksheet
8
- workbook = openpyxl.load_workbook(filename)
9
- sheet = workbook.active
10
-
11
- # Iterate through each row in the sheet
12
- for row in sheet.iter_rows(min_row=1, min_col=2, max_col=3):
13
- Bn, Cn = row # Assuming B and C are columns 2 and 3 respectively
14
- vector = my_new_openai.vectorize_data(f"{Bn.value}: {Cn.value}") if Bn.value and Cn.value else 0
15
- if vector != 0:
16
- for val in vector:
17
- sheet.cell(row=Bn.row, column=4+vector.index(val)).value = val
18
-
19
- # Save the workbook
20
- workbook.save(filename)
21
- print(f"Updated the file '{filename}' with vectors in column D.")
22
-
23
-
24
- def load_vectorstore_from_excel(filename):
25
- # returns a dictonary
26
- # Load the workbook and select the active worksheet
27
- workbook = openpyxl.load_workbook(filename)
28
- sheet = workbook.active
29
-
30
- # Iterate through each row in the sheet
31
- vec_store = {}
32
- for row in range(3, 634):
33
- vec = []
34
- for col in range(0, 1536):
35
- val = sheet.cell(row=row, column=4+col).value
36
- vec.append(val)
37
- vec_store[str(sheet.cell(row=row, column=1).value)] = vec
38
- return vec_store
39
-
40
-
41
- if __name__ == '__main__':
42
- #update_excel_with_sums("KBOB_Klassifizierung.xlsx")
43
- t = time.time()
44
-
45
- vec_store = load_vectorstore_from_excel("KBOB_Klassifizierung.xlsx")
46
-
47
- print(time.time()-t)
48
- for e in vec_store.keys():
49
- print(f"{e}: {vec_store[e][0]}, {vec_store[e][1]}, .... {vec_store[e][-1]}")
50
-
 
1
+ import time
2
+ import openpyxl
3
+ import my_new_openai
4
+
5
+
6
def update_excel_with_sums(filename):
    """Embed each row's B/C text and store the vector in columns D onward.

    For every row of the active worksheet whose column B and column C cells
    are both truthy, the string ``"<B>: <C>"`` is passed to
    ``my_new_openai.vectorize_data`` and the returned values are written
    into the same row, one value per cell, starting at column D. Rows with
    an empty B or C cell are left untouched. The workbook is then saved
    back to *filename*, overwriting it.

    Args:
        filename: Path of the Excel workbook to read and overwrite.
    """
    first_vector_column = 4  # column D

    workbook = openpyxl.load_workbook(filename)
    sheet = workbook.active

    for label_cell, text_cell in sheet.iter_rows(min_row=1, min_col=2, max_col=3):
        if not (label_cell.value and text_cell.value):
            continue

        # NOTE(review): vectorize_data is assumed to return a sequence of
        # numbers -- confirm against my_new_openai.
        vector = my_new_openai.vectorize_data(f"{label_cell.value}: {text_cell.value}")

        # enumerate() gives every component its own column. Looking the
        # position up with vector.index(value) returns the FIRST match, so
        # repeated values would all land in one cell and leave gaps.
        for column, component in enumerate(vector, start=first_vector_column):
            sheet.cell(row=label_cell.row, column=column).value = component

    workbook.save(filename)
    print(f"Updated the file '{filename}' with vectors in column D.")
22
+
23
+
24
def load_vectorstore_from_excel(filename, first_row=3, last_row=633, dim=1536):
    """Load the vectors stored in an Excel sheet into a dictionary.

    Reads the active worksheet of *filename*. For each row from *first_row*
    to *last_row* (inclusive) the column A value, converted with ``str()``,
    becomes the key, and the *dim* cell values starting at column D become
    the vector. Empty cells come back as ``None``. If two rows share a key,
    the later row wins.

    Args:
        filename: Path of the Excel workbook to read.
        first_row: 1-based index of the first data row. Defaults to 3, the
            value that used to be hard-coded.
        last_row: 1-based index of the last data row (inclusive). Defaults
            to 633, the value that used to be hard-coded. Pass ``None`` to
            read up to the sheet's last used row.
        dim: Number of vector components stored per row. Defaults to 1536,
            the value that used to be hard-coded.

    Returns:
        A dictionary mapping each row's key to a list of *dim* cell values.
    """
    key_column = 1           # column A
    first_vector_column = 4  # column D

    workbook = openpyxl.load_workbook(filename)
    sheet = workbook.active
    if last_row is None:
        last_row = sheet.max_row

    # One pass over plain values instead of one sheet.cell() call per cell.
    rows = sheet.iter_rows(
        min_row=first_row,
        max_row=last_row,
        min_col=key_column,
        max_col=first_vector_column + dim - 1,
        values_only=True,
    )

    vector_offset = first_vector_column - key_column
    vec_store = {}
    for values in rows:
        vec_store[str(values[0])] = list(values[vector_offset:])
    return vec_store
+ return vec_store
39
+
40
+
41
if __name__ == '__main__':
    # update_excel_with_sums("KBOB_Klassifizierung.xlsx")

    # Time how long it takes to read the whole vector store from disk.
    started_at = time.time()
    store = load_vectorstore_from_excel("../data/KBOB_Klassifizierung.xlsx")
    elapsed = time.time() - started_at
    print(elapsed)

    # Show the first two and the last component of every stored vector.
    for key, vec in store.items():
        print(f"{key}: {vec[0]}, {vec[1]}, .... {vec[-1]}")
50
+