elia-waefler committed on
Commit
c2b923e
·
verified ·
1 Parent(s): b863ba1

Upload 17 files

Browse files
Files changed (17) hide show
  1. .gitattributes +37 -35
  2. .gitignore +157 -0
  3. LICENSE +201 -0
  4. README.md +10 -13
  5. _IFC-checker.py +2 -0
  6. _ecodomus.py +40 -0
  7. _local_embeddings.py +2 -0
  8. app.py +309 -0
  9. ingest.py +126 -0
  10. kbob_file_handler.py +106 -0
  11. my_1_reader.py +201 -0
  12. my_1_writer.py +98 -0
  13. my_2_sim_search.py +163 -0
  14. my_new_openai.py +153 -0
  15. my_vectors.py +17 -0
  16. requirements.txt +15 -0
  17. setup_db.py +50 -0
.gitattributes CHANGED
@@ -1,35 +1,37 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ U3_alle/faiss_index.index/index.faiss filter=lfs diff=lfs merge=lfs -text
37
+ KBOB_Klassifizierung.xlsx filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ wheels/
22
+ share/python-wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+ MANIFEST
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+ .hypothesis/
50
+ .pytest_cache/
51
+ cover/
52
+
53
+ # Translations
54
+ *.mo
55
+ *.pot
56
+
57
+ # Django stuff:
58
+ *.log
59
+ local_settings.py
60
+ db.sqlite3
61
+ db.sqlite3-journal
62
+
63
+ # Flask stuff:
64
+ instance/
65
+ .webassets-cache
66
+
67
+ # Scrapy stuff:
68
+ .scrapy
69
+
70
+ # Sphinx documentation
71
+ docs/_build/
72
+
73
+ # PyBuilder
74
+ .pybuilder/
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ # For a library or package, you might want to ignore these files since the code is
86
+ # intended to run in multiple environments; otherwise, check them in:
87
+ # .python-version
88
+
89
+ # pipenv
90
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
91
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
92
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
93
+ # install all needed dependencies.
94
+ #Pipfile.lock
95
+
96
+ # poetry
97
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
98
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
99
+ # commonly ignored for libraries.
100
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
101
+ #poetry.lock
102
+
103
+ # pdm
104
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
105
+ #pdm.lock
106
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
107
+ # in version control.
108
+ # https://pdm.fming.dev/#use-with-ide
109
+ .pdm.toml
110
+
111
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
112
+ __pypackages__/
113
+
114
+ # Celery stuff
115
+ celerybeat-schedule
116
+ celerybeat.pid
117
+
118
+ # SageMath parsed files
119
+ *.sage.py
120
+
121
+ # Environments
122
+ .env
123
+ env/
124
+ ENV/
125
+ env.bak/
126
+ venv.bak/
127
+
128
+ # Spyder project settings
129
+ .spyderproject
130
+ .spyproject
131
+
132
+ # Rope project settings
133
+ .ropeproject
134
+
135
+ # mkdocs documentation
136
+ /site
137
+
138
+ # mypy
139
+ .mypy_cache/
140
+ .dmypy.json
141
+ dmypy.json
142
+
143
+ # Pyre type checker
144
+ .pyre/
145
+
146
+ # pytype static type analyzer
147
+ .pytype/
148
+
149
+ # Cython debug symbols
150
+ cython_debug/
151
+
152
+ # PyCharm
153
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
154
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
155
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
156
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
157
+ #.idea/
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,13 +1,10 @@
1
- ---
2
- title: Ki Rag Classify
3
- emoji: 🦀
4
- colorFrom: gray
5
- colorTo: pink
6
- sdk: streamlit
7
- sdk_version: 1.34.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: ki_inselspital
3
+ emoji: 🏥
4
+ colorFrom: yellow
5
+ colorTo: indigo
6
+ sdk: streamlit
7
+ sdk_version: 1.33.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
 
 
 
_IFC-checker.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # nicht aktuell
2
+ # my utils
_ecodomus.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import urllib.parse
3
+ import os
4
+
5
# Secrets are read from environment variables so they never live in the
# repository. Both lookups use os.environ[...] so that a missing variable
# fails immediately with KeyError instead of being sent as the text "None".
client_id = 'Siemens.Advanta'
client_secret = os.environ['SIEMENS_API_KEY']
username = "I0340828"
password = os.environ["SIEMENS_EW_PW"]

# Token endpoint. The previous value ended in " HTTP/1.1", a leftover from a
# raw HTTP request line; it is not part of the URL and corrupted the path.
url = "https://eu-ecodomus-services.siemens.com/api/token"

# Form fields of the OAuth2 password grant, URL-encoded for the request body.
data = {
    'client_id': client_id,
    'client_secret': client_secret,
    'username': username,
    'password': password,
    'grant_type': 'password'
}
# NOTE: encoded_data contains the password and client secret -- never print
# or log it.
encoded_data = urllib.parse.urlencode(data)

# Headers
headers = {
    'Content-Type': 'application/x-www-form-urlencoded'
}

# POST request; the timeout keeps the script from hanging forever when the
# service does not answer.
response = requests.post(url, data=encoded_data, headers=headers, timeout=30)

print("Status Code:", response.status_code)
print("Response Content:", response.text)
print("content", response.content)

if response.status_code == 200:
    access_token = response.json().get('access_token')
    print("Access Token:", access_token)
else:
    print("Failed to fetch access token. Check response content for details.")
_local_embeddings.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ #nicht aktuell
2
+ # my Utils
app.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ testing my own vectors
3
+
4
+ list comprehension whenever possible
5
+ main function
6
+ if name == main
7
+ reusable functions that do just one specific task
8
+ type checking
9
+ def my_function(in_one: str, in_two: int) -> None:
10
+ pip install mypy for static typechecking.
11
+
12
+ O Gebäudebetrieb
13
+ Reinigung
14
+
15
+
16
+ FM Prozesse nicht für klassifizierung
17
+ Phase auch nicht. IMMER 53!!
18
+
19
+ VISION: AUTOMATISCHE BENENNUNG BEI ECODOMUS UPLOAD
20
+ Automatische Metadatenzuordnung
21
+
22
+
23
+
24
+
25
+ """
26
+ import json
27
+
28
+ import ingest
29
+ import my_1_writer
30
+ import my_2_sim_search
31
+ import my_vectors
32
+ import setup_db
33
+ import my_new_openai
34
+ import time
35
+ import streamlit as st
36
+ import os
37
+ from PIL import Image
38
+ import json
39
+ from typing import Any, Dict
40
+
41
+
42
+ def read_json_file(file_path: str) -> Dict[str, Any]:
43
+ """
44
+ Diese Funktion liest den Inhalt einer JSON-Datei und gibt ihn als Wörterbuch zurück.
45
+
46
+ Argumente:
47
+ file_path (str): Der Dateipfad zur JSON-Datei.
48
+
49
+ Rückgabewert:
50
+ Dict[str, Any]: Der Inhalt der JSON-Datei als DICT
51
+ ANY ist oft ein VECTOR = list[float]
52
+ """
53
+ try:
54
+ with open(file_path, 'r', encoding='utf-8') as file:
55
+ content = json.load(file)
56
+ return content
57
+ except Exception as e:
58
+ return {"error": str(e)}
59
+
60
+
61
+ #test this:
62
+ def extract_tables_from_page_advanced(page):
63
+ """Extrahiert einfache Tabellenstrukturen basierend auf Textblöcken einer Seite."""
64
+ text_blocks = page.get_text("blocks")
65
+ text_blocks = sorted(text_blocks, key=lambda block: (block[1], block[0])) # Nach Y, dann X sortieren
66
+
67
+ # Erstellen eines Histogramms der X-Startpunkte, um Spalten zu identifizieren
68
+ column_threshold = 10 # Mindestabstand zwischen verschiedenen Spalten
69
+ columns = {}
70
+ for block in text_blocks:
71
+ x_start = block[0]
72
+ found_column = False
73
+ for col in columns.keys():
74
+ if abs(x_start - col) < column_threshold:
75
+ columns[col].append(block)
76
+ found_column = True
77
+ break
78
+ if not found_column:
79
+ columns[x_start] = [block]
80
+
81
+ # Tabellenzeilen basierend auf den identifizierten Spalten extrahieren
82
+ tables = []
83
+ for col, blocks in columns.items():
84
+ table = []
85
+ for block in sorted(blocks, key=lambda block: block[1]): # Nach Y sortieren
86
+ table.append(block[4].strip()) # Text des Blocks hinzufügen
87
+ tables.append(table)
88
+
89
+ return tables
90
+
91
+
92
+
93
+ def merge_indices(index1, index2):
94
+ """
95
+ Merge two indices into a new index, assuming both are of the same type and dimensionality.
96
+ """
97
+ pass
98
+
99
+
100
+ def handle_userinput(user_question):
101
+ pass
102
+
103
+
104
+ def save_uploaded_file(uploaded_file):
105
+ try:
106
+ # Create a static folder if it doesn't exist
107
+ if not os.path.exists('static'):
108
+ os.makedirs('static')
109
+
110
+ # Write the uploaded file to a new file in the static directory
111
+ with open(os.path.join('static', uploaded_file.name), "wb") as f:
112
+ f.write(uploaded_file.getbuffer())
113
+ return True
114
+ except Exception as e:
115
+ print(e)
116
+ return False
117
+
118
+
119
+ def main():
120
+ st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
121
+ if True:
122
+ if "conversation" not in sst:
123
+ sst.conversation = None
124
+ if "chat_history" not in sst:
125
+ sst.chat_history = None
126
+ if "page" not in sst:
127
+ sst.page = "home"
128
+ if "openai" not in sst:
129
+ sst.openai = True
130
+ if "login" not in sst:
131
+ sst.login = False
132
+ if 'submitted_user_query' not in sst:
133
+ sst.submitted_user_query = ''
134
+ if 'submitted_user_safe' not in sst:
135
+ sst.submitted_user_safe = ''
136
+ if 'submitted_user_load' not in sst:
137
+ sst.submitted_user_load = ''
138
+ if 'widget_user_load' not in sst:
139
+ sst.widget_user_load = 'U3_alle' # Init the vectorstore
140
+ if 'vectorstore' not in sst:
141
+ sst.vectorstore = None
142
+
143
+ def submit_user_query():
144
+ sst.submitted_user_query = sst.widget_user_query
145
+ sst.widget_user_query = ''
146
+
147
+ def submit_user_safe():
148
+ sst.submitted_user_safe = sst.widget_user_safe
149
+ sst.widget_user_safe = ''
150
+ if sst.vectorstore is not None:
151
+ my_vectors.save_local(sst.vectorstore, path=sst.submitted_user_safe)
152
+ st.sidebar.success("saved")
153
+ else:
154
+ st.sidebar.warning("No embeddings to save. Please process documents first.")
155
+
156
+ def submit_user_load():
157
+ sst.submitted_user_load = sst.widget_user_load
158
+ sst.widget_user_load = ''
159
+ if os.path.exists(sst.submitted_user_load):
160
+ new_db = my_vectors.load_local(f"{sst.submitted_user_load}/faiss_index.index")
161
+ if sst.vectorstore is not None:
162
+ if new_db is not None: # Check if this is working
163
+ st.sidebar.success("Vectors loaded")
164
+ else:
165
+ if new_db is not None: # Check if this is working
166
+ sst.vectorstore = new_db
167
+ st.sidebar.success("Vectors loaded")
168
+ else:
169
+ st.sidebar.warning("Couldn't load/find embeddings")
170
+
171
+ st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
172
+ if st.toggle("show README"):
173
+
174
+ st.subheader("Funktion: ")
175
+ st.write("dieses proof-of-concept von Elia Wäfler demonstriert das Potential von RAG (Retrival Augmented Generation) für BIM2FM Dokumentenablagen am Beispiel Dokumente U3 ASH (Anna Seiler Haus, Inselspital Bern). chatte mit den Dokumenten, oder lade selber ein oder mehrere PDF-Dokumente hoch, um RAG auszuprobieren. die vektoren werden lokal oder im st.session_state gespeichert. Feedback und Bugs gerne an elia.waefler@insel.ch")
176
+ st.write("Vielen Dank.")
177
+ st.write("")
178
+
179
+ st.subheader("Licence and credits")
180
+ st.write("THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.")
181
+ st.write("special thanks to OpenAI, STREAMLIT, HUGGINGFACE, LANGCHAIN and alejandro-ao")
182
+ l, r = st.columns(2)
183
+ with l:
184
+ st.subheader("Limitationen: ")
185
+ st.write("bisher nur Text aus PDFs")
186
+ st.write("macht Fehler, kann falsche Informationen geben")
187
+ st.write("prompts werden bisher nicht geprüft")
188
+ st.write("")
189
+ with r:
190
+ st.subheader("geplante Erweiterungen:")
191
+ st.write("Tabellen, Bilder werden auch vektorisiert, um die retrival qualität zu verbessern")
192
+ st.write("on premise anwendung mit mistral 7b oder vergleichbar")
193
+ st.write("Ecodomus API einbinden, um alle Dokumente einzubinden.")
194
+ st.write("")
195
+
196
+ if sst.login:
197
+ if st.toggle("RAG / classifier"):
198
+ #user_question = st.text_input("Ask a question about your documents:", key="user_query", on_change=handle_query)
199
+ st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)
200
+ #sst.openai = st.toggle(label="use openai?")
201
+ if sst.submitted_user_query:
202
+ if sst.vectorstore is not None:
203
+ handle_userinput(sst.submitted_user_query)
204
+ sst.submitted_user_query = False
205
+ else:
206
+ st.warning("no vectorstore loaded.")
207
+
208
+ with st.sidebar:
209
+ st.subheader("Your documents")
210
+ pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
211
+ if st.button("Process"):
212
+ with st.spinner("Processing"):
213
+ vec = ingest.get_text_chunks(ingest.get_pdf_text(pdf_docs))
214
+ st.warning("only text")
215
+ sst.vectorstore = vec
216
+ sst.conversation = vec
217
+ st.success("embedding complete")
218
+ st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
219
+ on_change=submit_user_safe)
220
+ st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
221
+ on_change=submit_user_load)
222
+ if st.toggle("reset vectorstore?"):
223
+ if st.button("Yes, reset"):
224
+ sst.vectorstore = None
225
+ st.warning("vectorstore reset complete")
226
+ else:
227
+ st.warning("unsaved embeddings will be lost.")
228
+ else:
229
+ #vec_store = setup_db.load_vectorstore_from_excel("data/KBOB_Klassifizierung.xlsx")
230
+ #my_1_writer.safe_my_dict_as_json("data/KBOB_klassen_codes.json", vec_store)
231
+ vec_store = read_json_file("data/KBOB_klassen_codes.json")
232
+
233
+ sst.page = "home"
234
+ file = st.file_uploader("upload file", accept_multiple_files=False)
235
+ if st.button("classify me!"):
236
+ with st.spinner("Classifying..."):
237
+ query_vecs = []
238
+ if file.type == "application/pdf":
239
+ one, two, three, four, five = st.columns(5)
240
+ text = ingest.get_pdf_text(file)
241
+ with one:
242
+ st.success("text")
243
+ # ONE FILE ONLY OR MULTIPLE AT THE SAME TIME?
244
+ images = ingest.get_pdf_images(file.getvalue())
245
+ if type(images) != list:
246
+ images = [images]
247
+ for img in images:
248
+ text += my_new_openai.img_to_text(img_base64=my_new_openai.image_bytes_to_base64(img))
249
+ with two:
250
+ st.success("images")
251
+
252
+ tabs = ingest.get_pdf_tables(file.getvalue())
253
+
254
+ if type(tabs) != list:
255
+ tabs = [tabs]
256
+ for tab in tabs:
257
+ text += my_new_openai.table_to_text(table=tab)
258
+ with three:
259
+ st.success("tabs")
260
+
261
+ # ONE VECTOR PER PDF OR MULTIPLE (CHUNKS IMGS ...) IS THE QUESTION
262
+ full_search = my_new_openai.vectorize_data(text)
263
+ detail_search = [my_new_openai.vectorize_data(_) for _ in ingest.get_text_chunks(text)]
264
+ with four:
265
+ st.success("embedded document")
266
+ st.write(len(list(vec_store.keys())))
267
+ with one:
268
+ sorted_vec_table = my_2_sim_search.sim_search_fly(
269
+ vec_table=vec_store, term=full_search)
270
+ st.write(f"len of list of categories {len(list(sorted_vec_table.keys()))}")
271
+ st.write(f"the most fitting category is {next(iter(sorted_vec_table))}")
272
+ with two:
273
+ sorted_vecs_two = my_2_sim_search.sim_search_fly(
274
+ vec_table=read_json_file("vecs/Fachbereiche_vecs.json"), term=full_search)
275
+ st.write(f"len of list of categories {len(list(sorted_vecs_two.keys()))}")
276
+ st.write(f"the most fitting Fachbereich is {next(iter(sorted_vecs_two))}")
277
+ with three:
278
+ sorted_vecs_three = my_2_sim_search.sim_search_fly(
279
+ vec_table=read_json_file("vecs/SIA-PHASEN 1-5 OUTPUT_vecs.json"), term=full_search)
280
+ st.write(f"len of list of categories {len(list(sorted_vecs_three.keys()))}")
281
+ st.write(f"the most fitting SIA-Phase is {next(iter(sorted_vecs_three))}")
282
+ for vec in detail_search:
283
+ pass
284
+ with four:
285
+ st.success("classification complete")
286
+ else:
287
+ st.error()
288
+ else:
289
+ user_pw = st.text_input("ASK_ASH_PASSWORD: ", type="password")
290
+ if st.button("check"):
291
+ time.sleep(0.5)
292
+ if user_pw == ASK_ASH_PASSWORD:
293
+ sst.login = True
294
+ if "first_load" not in sst:
295
+ submit_user_load()
296
+ sst.first_load = True
297
+ st.rerun()
298
+
299
+
300
+
301
+ if __name__ == '__main__':
302
+ if True:
303
+ OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
304
+ OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
305
+ HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
306
+ sst = st.session_state
307
+ ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
308
+ main()
309
+
ingest.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyPDF2 import PdfReader
2
+ from langchain.text_splitter import CharacterTextSplitter
3
+ import tabula
4
+ import io
5
+ import fitz # PyMuPDF
6
+ import pdfplumber
7
+ from PIL import Image
8
+ import io
9
+
10
+
11
def get_pdf_tables(pdf_bytes):
    """
    Extract every table found in a PDF that is supplied as raw bytes.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[pandas.DataFrame]: One DataFrame per non-empty table. The first
        extracted row of each table is used as the column header, the
        remaining rows as data.
    """
    raw_tables = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        for page in document.pages:
            # extract_tables() yields each table as a list of rows.
            raw_tables.extend(page.extract_tables())

    import pandas as pd

    frames = []
    for rows in raw_tables:
        if not rows:
            # Skip tables for which nothing was extracted.
            continue
        header, *body = rows
        frames.append(pd.DataFrame(body, columns=header))
    return frames
34
+
35
+
36
def get_pdf_images(pdf_bytes):
    """
    Collect a rendering of every page plus all images embedded in a PDF.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[bytes]: For each page, first the PNG bytes of the rendered page,
        followed by the bytes of every image referenced on that page (as
        returned by ``Document.extract_image``).
    """
    images = []
    doc = fitz.open("pdf", io.BytesIO(pdf_bytes).read())
    try:
        for page in doc:
            # Render the whole page and keep it as PNG bytes.
            images.append(page.get_pixmap().tobytes("png"))

            # Each entry of get_images() starts with the image's xref, which
            # is what extract_image() needs.
            for img in page.get_images(full=True):
                images.append(doc.extract_image(img[0])["image"])
    finally:
        # Release the document even when rendering or extraction fails.
        doc.close()
    return images
65
+
66
+
67
def get_pdf_old_tables(pdf_bytes):
    """
    Extract tables from a PDF given as bytes, using Tabula.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        Whatever ``tabula.read_pdf`` returns for all pages with
        ``multiple_tables=True`` (documented by the original author as a list
        of pandas DataFrames, one per table).
    """
    # Tabula is handed an in-memory stream instead of a file path.
    return tabula.read_pdf(io.BytesIO(pdf_bytes), pages='all', multiple_tables=True)
80
+
81
+
82
def get_pdf_text(pdf_docs):
    """
    Concatenate the text of every page of one or several PDFs.

    Args:
        pdf_docs: A single object accepted by ``PdfReader`` (path or
            file-like object), or a list/tuple of such objects.

    Returns:
        str: The page texts joined in document and page order, without any
        separator. An empty list/tuple yields an empty string.
    """
    documents = pdf_docs if isinstance(pdf_docs, (list, tuple)) else [pdf_docs]
    parts = []
    for pdf in documents:
        for page in PdfReader(pdf).pages:
            # Fall back to "" so a page without extractable text can never
            # break the concatenation.
            parts.append(page.extract_text() or "")
    return "".join(parts)
94
+
95
+
96
def get_text_chunks(text):
    """
    Split ``text`` into overlapping chunks with langchain's CharacterTextSplitter.

    The splitter is configured to split on newlines, with a chunk size of 1000
    and an overlap of 200, both measured with ``len``.

    Returns:
        The chunks produced by ``split_text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
105
+
106
+
107
def extract_images_from_pdf_path(pdf_path):
    """
    Load every image referenced by the pages of a PDF file as a PIL image.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        list: One ``PIL.Image`` per image reference, in page order. The image
        data is copied into memory first, so the images stay usable after the
        PDF has been closed.
    """
    images = []
    doc = fitz.open(pdf_path)
    try:
        for page_index in range(len(doc)):
            for img in doc.get_page_images(page_index):
                # img[0] is the xref that extract_image() expects.
                img_bytes = doc.extract_image(img[0])['image']
                images.append(Image.open(io.BytesIO(img_bytes)))
    finally:
        # The original never closed the document; release it in all cases.
        doc.close()
    return images
121
+
122
+
123
def get_tables_from_pdf_path(pdf_path):
    """Return the result of ``tabula.read_pdf`` for all pages of the PDF at ``pdf_path``."""
    return tabula.read_pdf(pdf_path, pages='all')
kbob_file_handler.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from typing import Any
3
+ from typing import List
4
+ import my_new_openai
5
+ import my_1_writer
6
+
7
+
8
def get_row_names(file_path: str, sheet_name="Block5", header=4, index_col=7) -> List[str]:
    """
    Read all row labels of the given Excel sheet and return them as strings.

    Args:
        file_path (str): Path to the Excel file.
        sheet_name: Sheet to read.
        header: Row index used as the header row by ``pandas.read_excel``.
        index_col: Column index used as the row index by ``pandas.read_excel``.

    Returns:
        List[str]: The row labels as strings. If reading fails for any reason,
        a single-element list containing the error message is returned instead
        of raising.
    """
    read_options = dict(sheet_name=sheet_name, header=header, index_col=index_col)
    try:
        frame = pd.read_excel(file_path, **read_options)
        row_labels = frame.index.astype(str).tolist()
    except Exception as error:
        # Best effort by design: the error text is reported through the
        # normal return channel.
        row_labels = [str(error)]
    return row_labels
23
+
24
+
25
def get_col_names(file_path: str, sheet_name="Block5", header=4, index_col=7) -> List[str]:
    """
    Read all column names of the given Excel sheet and return them as strings.

    Args:
        file_path (str): Path to the Excel file.
        sheet_name: Sheet to read.
        header: Row index used as the header row by ``pandas.read_excel``.
        index_col: Column index used as the row index by ``pandas.read_excel``.

    Returns:
        List[str]: The column names as strings. If reading fails for any
        reason, a single-element list containing the error message is returned
        instead of raising.
    """
    read_options = dict(sheet_name=sheet_name, header=header, index_col=index_col)
    try:
        frame = pd.read_excel(file_path, **read_options)
        column_labels = frame.columns.astype(str).tolist()
    except Exception as error:
        # Best effort by design: the error text is reported through the
        # normal return channel.
        column_labels = [str(error)]
    return column_labels
40
+
41
+
42
def get_cell_value(file_path: str, row_name: str, col_name: str, sheet_name="Block5",
                   header=4, index_col=8) -> str:
    """
    Return the value of the cell addressed by a row label and a column name.

    Args:
        file_path (str): Path to the Excel file.
        row_name (str): Label of the row (looked up in the sheet's index).
        col_name (str): Name of the column.
        sheet_name: Sheet to read.
        header: Row index used as the header row. The default (4) is the
            value that used to be hard-coded.
        index_col: Column index used as the row index. The default (8) is the
            value that used to be hard-coded.
            NOTE(review): get_row_names/get_col_names default to 7 — confirm
            which column is intended.

    Returns:
        str: The cell value as a string. If the row or column is missing, or
        reading fails, a message describing the problem is returned instead
        of raising.
    """
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name, header=header, index_col=index_col)
        if row_name not in df.index:
            return f"Zeilenname '{row_name}' nicht im DataFrame gefunden."
        if col_name not in df.columns:
            return f"Spaltenname '{col_name}' nicht im DataFrame gefunden."
        return str(df.at[row_name, col_name])
    except Exception as e:
        # Best effort by design: report the error text instead of raising.
        return str(e)
65
+
66
+
67
def get_header_dict():
    """
    Group the column names of 'data/kbob.xlsx' by their group heading.

    The sheet is read twice through get_col_names: once with the default
    header row (the column names) and once with ``header=3`` (the group
    headings one row above). Heading cells that pandas reports as "Unnamed"
    inherit the closest named heading to their left; headings before the
    first named one become "".

    Returns:
        dict: Maps each group heading to the list of column names below it,
        in sheet order. Columns whose own name contains "Unnamed" are skipped.
    """
    file_path = 'data/kbob.xlsx'
    # print(get_cell_value(file_path, 'sdlöfjasko', 'Kältetechnik (D06)'))
    cols = get_col_names(file_path)
    colheads = get_col_names(file_path, header=3)

    # Forward-fill the group headings over the unnamed cells.
    head = ""
    for position, e in enumerate(colheads):
        if "Unnamed" not in e:
            head = e
        else:
            colheads[position] = head
    print(colheads)
    print(cols)
    print(len(colheads), len(cols))

    my_cols = {}
    # zip() stops at the shorter list, so a length mismatch cannot raise.
    for group, col in zip(colheads, cols):
        if "Unnamed" not in col:
            my_cols.setdefault(group, []).append(col)
    print(my_cols)
    print((len(list(my_cols.keys()))))
    return my_cols
92
+
93
+
94
def create_kbob_vectors(headers: dict):
    """
    Embed the column names of every heading group and store them per group.

    For each key of ``headers`` the entries are prefixed with "<key>: ",
    passed to ``my_new_openai.vectorize_data`` and the result is written to
    'vecs/<key>_vecs.json'.

    Args:
        headers (dict): Maps a group heading to its list of column names.
    """
    for group, names in list(headers.items()):
        header_list = [group + ": " + name for name in names]
        print(f"{group}: {header_list}")
        print()
        vecs = my_new_openai.vectorize_data(header_list)
        my_1_writer.safe_my_dict_as_json(f"vecs/{group}_vecs.json", vecs)
101
+
102
+
103
if __name__ == '__main__':
    # Manual check: build the heading dictionary and show its group names.
    # The embedding step stays disabled.
    header_groups = get_header_dict()
    print(header_groups.keys())
    #create_kbob_vectors(h)
my_1_reader.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MUSS AUFGERÄUMT WERDEN
2
+
3
+ import json
4
+ import os
5
+ import subprocess
6
+ import PyPDF2
7
+ import csv
8
+ import fitz # PyMuPDF
9
+
10
+
11
def extract_text_from_pdf(pdf_path):
    """
    Extracts all text from a PDF file.

    :param pdf_path: Path to the PDF file.
    :return: The text of all pages, concatenated in page order.
    """
    doc = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string page by page.
        return ''.join(doc.load_page(page_num).get_text() for page_num in range(len(doc)))
    finally:
        # Close the document even if a page fails to load.
        doc.close()
36
+
37
+
38
def read_pdfs_from_folder(folder_path):
    """
    Reads all PDF files in the specified folder using PdfReader and extracts their text.

    The extracted text is normalised: German umlauts are transliterated
    (ä/Ä -> "ae", ö/Ö -> "oe", ü/Ü -> "ue"), the characters , ; \\ and "
    become "_", and every newline becomes the marker "<newline>".

    Parameters:
    - folder_path: The path to the folder containing PDF files.

    Returns:
    - A dictionary with file names as keys and their normalised text as values.
      Only files whose name ends with '.pdf' are considered.
    """
    pdf_texts = {}
    for filename in os.listdir(folder_path):
        if not filename.endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text = ''
            for page in pdf_reader.pages:
                try:
                    text += page.extract_text()
                except UnicodeDecodeError as e:
                    # Best effort: report the page that failed and carry on.
                    print(e)
        pdf_texts[filename] = _sanitize_pdf_text(text)
    return pdf_texts


def _sanitize_pdf_text(text):
    """Apply the character replacements of read_pdfs_from_folder in one pass."""
    # One translate() call replaces the former per-character slicing, which
    # re-scanned and re-copied the whole string for every hit (quadratic).
    replacements = str.maketrans({
        "ä": "ae", "Ä": "ae",
        "ö": "oe", "Ö": "oe",
        "ü": "ue", "Ü": "ue",
        ",": "_", ";": "_", "\\": "_", '"': "_",
        "\n": "<newline>",
    })
    return text.translate(replacements)
73
+
74
+
75
def read_csv_lines_as_strings(filename):
    """
    Parse a CSV file and return every record as one comma-joined string.

    Parameters:
    - filename: The path to the CSV file.

    Returns:
    - A list of strings, one per CSV record, with the fields re-joined by ','
      (quoting of the original file is not restored). If a UnicodeDecodeError
      occurs while reading, it is printed and the records read so far are
      returned.
    """
    collected = []
    with open(filename, newline='') as handle:
        try:
            for fields in csv.reader(handle):
                collected.append(','.join(fields))
        except UnicodeDecodeError as decode_error:
            print(decode_error)
    return collected
96
+
97
+
98
+ # Function to load data from JSON files
99
def load_data(filename):
    """
    Load and return the content of a JSON file.

    If decoding the file raises a UnicodeDecodeError, the error is printed and
    an empty dict is returned.
    """
    with open(filename, 'r') as handle:
        try:
            content = json.load(handle)
        except UnicodeDecodeError as err:
            print(err)
            content = {}
        return content
106
+
107
+
108
def find_and_open_file(filename, start_directory):
    """
    Search ``start_directory`` and all of its subfolders for ``filename``.

    Despite its name the function does not open anything: it prints and
    returns the path of the first match found by os.walk, or prints a
    "not found" message and returns None.
    """
    for current_dir, _subdirs, filenames in os.walk(start_directory):
        if filename not in filenames:
            continue
        filepath = os.path.join(current_dir, filename)
        print(f"File found: {filepath}")
        return filepath
    print(f"File {filename} not found.")
    return None
120
+
121
+
122
def open_file(filepath):
    """
    Opens the file with the default application, based on the operating system.

    On Windows ``os.startfile`` is used. On POSIX systems the launcher command
    depends on the platform: ``open`` on macOS, ``xdg-open`` elsewhere (the
    original always ran ``open``, which is the macOS command).

    Prints a message and does nothing else if the file does not exist or the
    operating system is not supported.
    """
    import sys  # local import: only needed to tell macOS from other POSIX systems

    if not os.path.exists(filepath):
        print(f"File does not exist: {filepath}")
        return

    if os.name == 'nt':  # Windows
        os.startfile(filepath)
    elif os.name == 'posix':  # Linux, macOS, etc.
        launcher = 'open' if sys.platform == 'darwin' else 'xdg-open'
        subprocess.call((launcher, filepath))
    else:
        print(f"Cannot open file on this operating system: {filepath}")
135
+
136
+
137
def list_folders_files_recursive(path, depth=0):
    """
    Print the folders and files below ``path``, descending into subfolders.

    Each folder is printed when it is encountered and is immediately followed
    by its own content; the files of a directory are printed after all of its
    entries have been visited.

    Parameters:
    - path: The directory path to list contents from.
    - depth: The current depth of recursion (used for indentation in print statements).

    Returns:
    - None
    """
    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return

    indent = ' ' * depth  # Indentation based on recursion depth
    pending_files = []

    for name in os.listdir(path):
        candidate = os.path.join(path, name)
        if os.path.isdir(candidate):
            print(f"{indent}Folder: {name}")
            # Descend right away so the content appears below its folder.
            list_folders_files_recursive(candidate, depth + 1)
            continue
        if os.path.isfile(candidate):
            pending_files.append(name)

    for name in pending_files:
        print(f"{indent}File: {name}")
169
+
170
+
171
def list_folders_files(path):
    """
    Collect the names of the folders and files directly inside ``path``.

    Parameters:
    - path: The directory path to list contents from.

    Returns:
    - A tuple of two lists: (folders, files). Both are empty, and a message is
      printed, when ``path`` is not a directory.
    """
    folders, files = [], []

    if os.path.isdir(path):
        for name in os.listdir(path):
            candidate = os.path.join(path, name)
            if os.path.isdir(candidate):
                folders.append(name)
            elif os.path.isfile(candidate):
                files.append(name)
    else:
        print(f"The provided path '{path}' is not a valid directory.")

    return folders, files
198
+
199
+
200
if __name__ == "__main__":
    # Running this module directly only prints a short notice.
    notice = "here are all functions that read files"
    print(notice)
my_1_writer.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MUSS AUFGERÄUMT WERDEN
2
+
3
+ import json
4
+ import pandas as pd
5
+
6
+
7
def split_json_file(input_filepath, lines_per_file=50):
    """
    Split a text/JSON file into several files of at most 'lines_per_file' lines.

    The parts are written to 'translate_data/english_<n>.json' (numbered from
    1, relative to the current working directory) and a message is printed for
    each file created.

    param input_filepath: The path to the input JSON file.
    param lines_per_file: The maximum number of lines per output file.
    """
    with open(input_filepath, 'r') as source:
        all_lines = source.readlines()

    chunk_starts = range(0, len(all_lines), lines_per_file)
    for file_number, start in enumerate(chunk_starts, start=1):
        output_filename = f'translate_data/english_{file_number}.json'
        with open(output_filename, 'w') as target:
            target.writelines(all_lines[start:start + lines_per_file])
        print(f'Created {output_filename}')
34
+
35
+
36
def merge_and_save(list1, list2, dict1, dict2, filename='output.csv'):
    """
    Combine two lists and two dictionaries row by row and save them as CSV.

    Row i holds list1[i], list2[i] and the i-th key/value pair of dict1 and of
    dict2, under the headers:
    ['list1', 'list2', 'keys dict1', 'vals dict1', 'keys dict2', 'vals dict2'].
    One row is produced per element of list1, so the other inputs need at
    least as many entries.

    Parameters:
    - list1 (list): First list, contributing to column 'list1'.
    - list2 (list): Second list, contributing to column 'list2'.
    - dict1 (dict): First dictionary, keys and values added as separate columns.
    - dict2 (dict): Second dictionary, keys and values added as separate columns.
    - filename (str): Filename for the saved CSV file.
    """
    columns = ('list1', 'list2', 'keys dict1', 'vals dict1', 'keys dict2', 'vals dict2')
    pairs1 = list(dict1.items())
    pairs2 = list(dict2.items())

    data = []
    for i, first in enumerate(list1):
        key1, val1 = pairs1[i]
        key2, val2 = pairs2[i]
        data.append(dict(zip(columns, (first, list2[i], key1, val1, key2, val2))))

    pd.DataFrame(data).to_csv(filename, index=False)
    print(f"DataFrame saved as '{filename}' in the current directory.")
70
+
71
+
72
+ # new line for every entry
73
def safe_my_dict_as_json(file_name, my_dict):
    """
    Write a dictionary as a JSON object with one key/value pair per line.

    Args:
        file_name: Path of the file to (over)write.
        my_dict: The dictionary to store, or a list whose first element is
            that dictionary. An empty list is written as an empty object.

    The argument is printed first. All entries are serialised before the file
    is opened, so a value that cannot be serialised does not leave a
    half-written file behind.
    """
    print(my_dict)
    if isinstance(my_dict, list):
        # Unwrap BEFORE counting entries: the original measured the list
        # length, which misplaced the commas and produced invalid JSON.
        my_dict = my_dict[0] if my_dict else {}

    # json.dumps handles quoting/escaping of keys and values.
    entries = [f" {json.dumps(key)}: {json.dumps(value)}" for key, value in my_dict.items()]

    with open(file_name, 'w') as f:
        f.write('{\n')
        if entries:
            # A comma after every entry except the last one.
            f.write(",\n".join(entries) + "\n")
        f.write('}\n')
95
+
96
+
97
if __name__ == "__main__":
    # Running this module directly only prints a short notice.
    notice = "here are all functions that write to the Datasets"
    print(notice)
my_2_sim_search.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import my_new_openai
2
+ import my_1_writer
3
+ import json
4
+ import numpy as np
5
+
6
+
7
+ # sim search with dot_product and lin_distance
8
+ # the newly vectorized TERM will be added to the database
9
+ # database = .json file
10
def sim_search_load_db(database, term, add_to_db=True, debug=False):
    """
    Rank the entries of a JSON vector table against a search term.

    Args:
        database: Path of a JSON file mapping keys to vectors.
        term: Either a string (embedded via my_new_openai.vectorize_data) or a
            list that is used directly as the search vector.
        add_to_db: If true and the term is not yet a key of the table, the
            term is added under ``str(term)`` and the table is saved. Unless
            ``database`` is "session/my_words_vec_table.json", the table is
            saved to "session/my_vecs.json".
        debug: Print the vectors, intermediate values and the five best hits.

    Returns:
        dict: key -> score, sorted by descending score, where score is the dot
        product multiplied by the Euclidean distance. None if ``term`` is
        neither a string nor a list.
        NOTE(review): multiplying similarity by distance is kept from the
        original; confirm that this is the intended ranking.
    """
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    with open(database, "r") as f:
        table = json.load(f)
    sim_search_dict = {}
    for key, vector2 in table.items():
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        dp = np.dot(vector1, vector2)
        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            print(f"the distance is{distance}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp * distance

    # sort with the biggest similarity
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))

    if debug:
        # A dict cannot be sliced; take the first five items explicitly.
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    if add_to_db:
        # Vector terms are lists and therefore unhashable; look them up
        # under the same string key they are stored with.
        term_key = str(term)
        if term_key in table:
            print("the search term is in the database!")
        # add the newly vectorized term to the words, if not already in the vector table
        else:
            if database != "session/my_words_vec_table.json":
                database = "session/my_vecs.json"
            table[term_key] = vector1
            my_1_writer.safe_my_dict_as_json(database, table)
    if sorted_table:
        # Only announce a best match when there is one (empty table otherwise).
        print(f"the closest word to your input is: {next(iter(sorted_table))}")
    return sorted_table
60
+
61
+
62
def dot_p_to_1(database, vector1=0, analysis_filename=0):
    """
    Store the dot product of every vector in a JSON table with a reference vector.

    Args:
        database: Path of a JSON file mapping keys to vectors.
        vector1: The reference vector. 0 selects a constant vector of 1536
            entries, 1 selects the first vector of the table.
        analysis_filename: Target passed to my_1_writer.safe_my_dict_as_json.
    """
    with open(database, "r") as handle:
        table = json.load(handle)

    if vector1 == 0:
        vector1 = [0.025515518153991442] * 1536
    elif vector1 == 1:
        vector1 = table[str(list(table)[0])]

    dot_product_to1 = {key: np.dot(vector1, candidate) for key, candidate in table.items()}
    my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
    print("dot p to 1 saved")
77
+
78
+
79
def lin_dist(database, vector1=0, analysis_filename=0):
    """
    Store the Euclidean distance of every vector in a JSON table to a reference vector.

    Args:
        database: Path of a JSON file mapping keys to vectors.
        vector1: The reference vector. 0 selects a constant vector of 1536
            entries, 1 selects the first vector of the table.
        analysis_filename: Target passed to my_1_writer.safe_my_dict_as_json.
    """
    with open(database, "r") as handle:
        table = json.load(handle)

    if vector1 == 0:
        vector1 = [0.025515518153991442] * 1536
    elif vector1 == 1:
        vector1 = table[str(list(table)[0])]

    lin_dist_to_1 = {
        key: np.linalg.norm(np.array(vector1) - np.array(candidate))
        for key, candidate in table.items()
    }

    my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
    print("lin dist to 1 saved")
94
+
95
+
96
def manhattan_dist(database, vector1=0, analysis_filename=0):
    """
    Store the Manhattan (L1) distance of every vector in a JSON table to a reference vector.

    Args:
        database: Path of a JSON file mapping keys to vectors.
        vector1: The reference vector. 0 selects a constant vector of 1536
            entries, 1 selects the first vector of the table.
        analysis_filename: Target passed to my_1_writer.safe_my_dict_as_json.
    """
    with open(database, "r") as f:
        table = json.load(f)
    manhattan_dist_to_1 = {}

    if vector1 == 0:
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    reference = np.array(vector1)
    for key in table.keys():
        # The Manhattan distance sums ABSOLUTE differences; the original
        # summed signed differences, which can cancel out.
        manhattan_dist_to_1[key] = float(np.abs(reference - np.array(table[key])).sum())

    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
    print("manhattan dist to 1 saved")
111
+
112
+
113
+ #vec_table
114
def sim_search_fly(vec_table, term, debug=False):
    """
    Rank the entries of an in-memory vector table against a search term.

    Args:
        vec_table (dict): Maps keys to vectors.
        term: Either a string (embedded via my_new_openai.vectorize_data) or a
            list that is used directly as the search vector.
        debug: Print types, sample values, intermediate results and the five
            best hits.

    Returns:
        dict: key -> score sorted by descending score, where the score is the
        dot product of the search vector and the entry's vector. None if
        ``term`` is neither a string nor a list.
    """
    if debug:
        print(type(vec_table))
        print(type(term))
        if vec_table:
            # Show a sample from the first entry; the original indexed the
            # sixth key and failed on tables with fewer entries.
            sample = vec_table[next(iter(vec_table))]
            print(type(sample))
            print("vec table:")
            print(sample[:4])
        print("search term")
        print(term[:4])
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return

    sim_search_dict = {}
    for key, vector2 in vec_table.items():
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        # Vectors whose neighbouring components repeat get the fixed score
        # 200 instead of a dot product.
        # NOTE(review): this presumably detects placeholder/empty vectors,
        # but the score ranks them FIRST — confirm this is intended.
        looks_constant = (
            len(vector2) > 6
            and vector2[0] == vector2[1]
            and vector2[3] == vector2[4]
            and vector2[5] == vector2[6]
        )
        if looks_constant:
            dp = 200
        else:
            dp = np.dot(vector1, vector2)
        if debug:
            print(f"the dp is {dp}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp

    # sort with the biggest similarity
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))

    if debug:
        # A dict cannot be sliced; take the first five items explicitly.
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")

    if sorted_table:
        # Only announce a best match when there is one (empty table otherwise).
        print(f"the closest word to your input is: {next(iter(sorted_table))}")
    return sorted_table
my_new_openai.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #nicht aktuell
2
+
3
+ import os
4
+ from openai import OpenAI
5
+ import requests
6
+ import base64
7
+
8
+ client = OpenAI()
9
+
10
+
11
def image_bytes_to_base64(image_bytes):
    """
    Encode raw image bytes as a Base64 text string.

    Args:
        image_bytes (bytes): Byte content of the image.

    Returns:
        str: The Base64 representation of the bytes.
    """
    encoded = base64.b64encode(image_bytes)
    return str(encoded, 'utf-8')
22
+
23
+
24
def image_to_base64(image_path):
    """Read the file at ``image_path`` and return its content as a Base64 string."""
    with open(image_path, "rb") as image_file:
        raw = image_file.read()
    return base64.b64encode(raw).decode('utf-8')
27
+
28
+
29
def gpt4_new(prompt_text):
    """
    Send ``prompt_text`` as the user message to the "gpt-4" chat model.

    A fixed (German) system message tells the model that it is a machine that
    classifies documents.

    Returns:
        The content of the first choice of the response.
    """
    conversation = [
        {"role": "system",
         "content": "Du bist eine Maschine, die Dokumente klassifiziert."},
        {"role": "user", "content": prompt_text},
    ]
    gpt_response = client.chat.completions.create(model="gpt-4", messages=conversation)
    return gpt_response.choices[0].message.content
36
+
37
+
38
def vectorize_data(data_input):
    """
    Embed text with the "text-embedding-ada-002" model.

    Args:
        data_input: A string, or a list of items to embed one by one.

    Returns:
        - for a list: a dict mapping ``str(item)`` to the embedding of that item
          (an empty list gives an empty dict without any request),
        - for a string: the embedding vector itself,
        - otherwise: None, after printing "none".
    """
    model = "text-embedding-ada-002"

    if isinstance(data_input, list):
        # returning a dictionary
        # Embed each item on its own: the original sent the whole list for
        # every item and always took data[0], so every key received the
        # embedding of the FIRST item.
        return {
            str(item): client.embeddings.create(input=item, model=model).data[0].embedding
            for item in data_input
        }

    if isinstance(data_input, str):
        # returning just the vector
        return client.embeddings.create(input=data_input, model=model).data[0].embedding

    print("none")
    return None
55
+
56
+
57
def img_create(prompt="a nice house on the beach", download_path=""):
    """
    Generate a 1024x1024 image with "dall-e-3" and return its URL.

    Args:
        prompt: Text description of the image.
        download_path: If given, the image is downloaded and written to this
            path. A download answer other than HTTP 200 only prints a message.

    Returns:
        The URL of the generated image.
    """
    generation = client.images.generate(model="dall-e-3", prompt=prompt, size="1024x1024")
    my_url = generation.data[0].url
    if not download_path:
        return my_url

    my_image = requests.get(my_url)
    if my_image.status_code != 200:
        print("Failed to retrieve image")
        return my_url

    with open(download_path, 'wb') as f:
        f.write(my_image.content)
    return my_url
68
+
69
+
70
def img_to_text(img_url="", img_base64="", prompt="What’s in this image?", print_out=True):
    """
    Ask the "gpt-4-turbo" model to describe an image.

    Args:
        img_url: URL of the image; takes precedence when both inputs are given.
        img_base64: Base64 content of the image, sent as a JPEG data URL via a
            direct HTTP request to the chat completions endpoint.
        prompt: The question sent along with the image.
        print_out: Print the answer before returning it.

    Returns:
        The text of the model's answer. If neither input is given, the
        ValueError class itself is returned (not raised).
    """
    if not img_url and not img_base64:
        return ValueError

    if img_url:
        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": img_url,
                    },
                },
            ],
        }
        img_desc_response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[user_message],
            max_tokens=500,
        )
        description = img_desc_response.choices[0].message.content
        if print_out:
            print(description)
        return description

    # Base64 input: call the REST endpoint directly with the API key from the
    # environment.
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
    }
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{img_base64}"
        }
    }
    payload = {
        "model": "gpt-4-turbo",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    image_part,
                ]
            }
        ],
        "max_tokens": 300
    }
    img_desc_response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    description = img_desc_response.json()["choices"][0]["message"]["content"]
    if print_out:
        print(description)
    return description
125
+
126
+
127
def encode_image_to_base64(image_path):
    """Return the content of the file at ``image_path`` as a Base64 string."""
    with open(image_path, "rb") as image_file:
        raw = image_file.read()
    return str(base64.b64encode(raw), 'utf-8')
131
+
132
+
133
def table_to_text(table=None, prompt="describe this table in plain text. "
                                     "be as precise as possible. spare no detail. "
                                     "what is in this table?", print_out=True):
    """
    Let gpt4_new describe a table in plain text.

    Args:
        table: The table to describe; it is formatted into the prompt.
        prompt: Instruction placed before the table.
        print_out: Print the answer before returning it.

    Returns:
        The model's answer. If ``table`` is None, the ValueError class itself
        is returned (not raised).
    """
    if table is None:
        return ValueError

    response = gpt4_new(f"{prompt} TABLE: {table}")
    if print_out:
        print(response)
    return response
143
+
144
+
145
if __name__ == "__main__":
    # Manual smoke test. The commented lines are earlier experiments.
    #print("here are all functions that directly call openai.")
    #img_create("a skier in the swiss alps", download_path="skier.png")
    #img_to_text(img_base64=encode_image_to_base64("skier.png"))
    #print(image_to_base64("skier.png"))
    #print(vectorize_data("test string"))

    # gpt4_new() requires a prompt; calling it without one raised TypeError.
    print(gpt4_new("Reply with 'ok' if you are ready to classify documents."))
153
+
my_vectors.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
def safe_local(vectors, path):
    """Placeholder, not implemented yet: ignores its arguments and returns None."""
    # TODO: presumably meant to store `vectors` at `path` — confirm before implementing.
    return None
5
+
6
+
7
def merge_two(vec1, vec2):
    """Placeholder, not implemented yet: ignores its arguments and returns None."""
    # TODO: presumably meant to combine two vector collections — confirm before implementing.
    return None
9
+
10
+
11
def load_local(path):
    """Placeholder, not implemented yet: ignores its argument and returns None."""
    # TODO: presumably meant to load vectors stored at `path` — confirm before implementing.
    return None
13
+
14
+
15
+
16
if __name__ == "__main__":
    # Running this module directly only prints a short notice.
    notice = "you are in the my_vectors"
    print(notice)
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit~=1.33.0
2
+ bcrypt~=4.1.2
3
+ psycopg2-binary~=2.9.9
4
+ openai~=1.23.2
5
+ pypdf2~=3.0.1
6
+ langchain~=0.1.16
7
+ tiktoken~=0.6.0
8
+ numpy~=1.26.4
9
+ requests~=2.31.0
10
+ pandas~=2.2.2
11
+ tabula~=1.0.5
12
+ pdfplumber~=0.11.0
13
+ PyMuPDF~=1.24.3
14
+ pillow~=10.3.0
15
+ openpyxl~=3.1.2
setup_db.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import openpyxl
3
+ import my_new_openai
4
+
5
+
6
def update_excel_with_sums(filename):
    """
    Embed the text of columns B and C of every row and store the vector in the sheet.

    For each row of the active worksheet in which both B and C have a value,
    the string "<B>: <C>" is embedded with my_new_openai.vectorize_data and the
    vector components are written into the same row, one per column, starting
    at column D. The workbook is then saved back to ``filename``.

    Args:
        filename: Path of the Excel workbook to update in place.
    """
    # Load the workbook and select the active worksheet
    workbook = openpyxl.load_workbook(filename)
    sheet = workbook.active

    # Columns B and C are columns 2 and 3.
    for Bn, Cn in sheet.iter_rows(min_row=1, min_col=2, max_col=3):
        if not (Bn.value and Cn.value):
            continue
        vector = my_new_openai.vectorize_data(f"{Bn.value}: {Cn.value}")
        if not vector:
            continue
        # enumerate() gives each component its own column; the original
        # used vector.index(val), which sends duplicate values to the first
        # matching column and leaves later columns unwritten.
        for offset, val in enumerate(vector):
            sheet.cell(row=Bn.row, column=4 + offset).value = val

    # Save the workbook
    workbook.save(filename)
    print(f"Updated the file '{filename}' with vectors in column D.")
22
+
23
+
24
def load_vectorstore_from_excel(filename, first_row=3, last_row=633, vector_dim=1536,
                                first_vector_col=4):
    """
    Load a vector table from the active worksheet of an Excel workbook.

    Args:
        filename: Path of the Excel workbook.
        first_row: First sheet row (1-based) to read.
        last_row: Last sheet row (1-based, inclusive) to read.
        vector_dim: Number of vector components per row.
        first_vector_col: Sheet column (1-based) of the first component;
            expected to lie to the right of column 1, which holds the key.

    The defaults reproduce the previously hard-coded layout: rows 3 to 633,
    1536 components starting in column D.

    Returns:
        dict: Maps ``str(value of column 1)`` to the list of component values
        of that row. Empty cells appear as None; a repeated key keeps the
        values of the last row that uses it.
    """
    # Load the workbook and select the active worksheet
    workbook = openpyxl.load_workbook(filename)
    sheet = workbook.active

    vec_store = {}
    # Read whole rows as plain values instead of one cell lookup per component.
    rows = sheet.iter_rows(
        min_row=first_row,
        max_row=last_row,
        min_col=1,
        max_col=first_vector_col + vector_dim - 1,
        values_only=True,
    )
    for row in rows:
        # row[0] is column 1 (the key); the vector starts at first_vector_col.
        vec_store[str(row[0])] = list(row[first_vector_col - 1:])
    return vec_store
39
+
40
+
41
if __name__ == '__main__':
    # Manual check: time the loading of the vector table and print the first
    # two and the last component of every vector.
    #update_excel_with_sums("KBOB_Klassifizierung.xlsx")
    started = time.time()

    vec_store = load_vectorstore_from_excel("data/KBOB_Klassifizierung.xlsx")

    print(time.time()-started)
    for name, vec in vec_store.items():
        print(f"{name}: {vec[0]}, {vec[1]}, .... {vec[-1]}")
50
+