elia-waefler commited on
Commit
e133364
·
verified ·
1 Parent(s): d55fd7c

Upload 14 files

Browse files
Files changed (14) hide show
  1. .gitattributes +37 -35
  2. .gitignore +160 -0
  3. KBOB_Klassifizierung.xlsx +3 -0
  4. LICENSE +201 -0
  5. README.md +10 -13
  6. app.py +218 -0
  7. ask_app.py +243 -0
  8. classify_app.py +197 -0
  9. ingest.py +130 -0
  10. my_1_reader.py +201 -0
  11. my_2_sim_search.py +164 -0
  12. my_new_openai.py +195 -0
  13. requirements.txt +10 -0
  14. setup_db.py +50 -0
.gitattributes CHANGED
@@ -1,35 +1,37 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ U3_alle/faiss_index.index/index.faiss filter=lfs diff=lfs merge=lfs -text
37
+ KBOB_Klassifizierung.xlsx filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
KBOB_Klassifizierung.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd8797003fcbcfe6e52b7b8f2fcf56c8639e8807099f6af786bfddb5713a9761
3
+ size 13370614
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,13 +1,10 @@
1
- ---
2
- title: Ki Inselspital
3
- emoji: 🏆
4
- colorFrom: red
5
- colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: 1.34.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: ask-ASH
3
+ emoji: 🏥
4
+ colorFrom: yellow
5
+ colorTo: indigo
6
+ sdk: streamlit
7
+ sdk_version: 1.33.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
 
 
 
app.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ testing my own vectors
3
+ """
4
+ import ingest
5
+ import my_2_sim_search
6
+ import my_new_openai
7
+ import setup_db
8
+ import time
9
+ import streamlit as st
10
+ import os
11
+ import my_vectors
12
+
13
+
14
+ def merge_indices(index1, index2):
15
+ """
16
+ Merge two indices into a new index, assuming both are of the same type and dimensionality.
17
+ """
18
+ pass
19
+
20
+
21
+ def handle_userinput(user_question):
22
+ pass
23
+
24
+
25
+ def save_uploaded_file(uploaded_file):
26
+ try:
27
+ # Create a static folder if it doesn't exist
28
+ if not os.path.exists('static'):
29
+ os.makedirs('static')
30
+
31
+ # Write the uploaded file to a new file in the static directory
32
+ with open(os.path.join('static', uploaded_file.name), "wb") as f:
33
+ f.write(uploaded_file.getbuffer())
34
+ return True
35
+ except Exception as e:
36
+ print(e)
37
+ return False
38
+
39
+
40
+ def main():
41
+ st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
42
+ if True:
43
+ if "conversation" not in sst:
44
+ sst.conversation = None
45
+ if "chat_history" not in sst:
46
+ sst.chat_history = None
47
+ if "page" not in sst:
48
+ sst.page = "home"
49
+ if "openai" not in sst:
50
+ sst.openai = True
51
+ if "login" not in sst:
52
+ sst.login = False
53
+ if 'submitted_user_query' not in sst:
54
+ sst.submitted_user_query = ''
55
+ if 'submitted_user_safe' not in sst:
56
+ sst.submitted_user_safe = ''
57
+ if 'submitted_user_load' not in sst:
58
+ sst.submitted_user_load = ''
59
+ if 'widget_user_load' not in sst:
60
+ sst.widget_user_load = 'U3_alle' # Init the vectorstore
61
+ if 'vectorstore' not in sst:
62
+ sst.vectorstore = None
63
+
64
+ def submit_user_query():
65
+ sst.submitted_user_query = sst.widget_user_query
66
+ sst.widget_user_query = ''
67
+
68
+ def submit_user_safe():
69
+ sst.submitted_user_safe = sst.widget_user_safe
70
+ sst.widget_user_safe = ''
71
+ if sst.vectorstore is not None:
72
+ my_vectors.save_local(sst.vectorstore, path=sst.submitted_user_safe)
73
+ st.sidebar.success("saved")
74
+ else:
75
+ st.sidebar.warning("No embeddings to save. Please process documents first.")
76
+
77
+ def submit_user_load():
78
+ sst.submitted_user_load = sst.widget_user_load
79
+ sst.widget_user_load = ''
80
+ if os.path.exists(sst.submitted_user_load):
81
+ new_db = my_vectors.load_local(f"{sst.submitted_user_load}/faiss_index.index")
82
+ if sst.vectorstore is not None:
83
+ if new_db is not None: # Check if this is working
84
+ st.sidebar.success("Vectors loaded")
85
+ else:
86
+ if new_db is not None: # Check if this is working
87
+ sst.vectorstore = new_db
88
+ st.sidebar.success("Vectors loaded")
89
+ else:
90
+ st.sidebar.warning("Couldn't load/find embeddings")
91
+
92
+ st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
93
+ if st.toggle("show README"):
94
+
95
+ st.subheader("Funktion: ")
96
+ st.write("dieses proof-of-concept von Elia Wäfler demonstriert das Potential von RAG (Retrival Augmented Generation) für BIM2FM Dokumentenablagen am Beispiel Dokumente U3 ASH (Anna Seiler Haus, Inselspital Bern). chatte mit den Dokumenten, oder lade selber ein oder mehrere PDF-Dokumente hoch, um RAG auszuprobieren. die vektoren werden lokal oder im st.session_state gespeichert. Feedback und Bugs gerne an elia.waefler@insel.ch")
97
+ st.write("Vielen Dank.")
98
+ st.write("")
99
+
100
+ st.subheader("Licence and credits")
101
+ st.write("THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.")
102
+ st.write("special thanks to OpenAI, STREAMLIT, HUGGINGFACE, LANGCHAIN and alejandro-ao")
103
+ l, r = st.columns(2)
104
+ with l:
105
+ st.subheader("Limitationen: ")
106
+ st.write("bisher nur Text aus PDFs")
107
+ st.write("macht Fehler, kann falsche Informationen geben")
108
+ st.write("prompts werden bisher nicht geprüft")
109
+ st.write("")
110
+ with r:
111
+ st.subheader("geplante Erweiterungen:")
112
+ st.write("Tabellen, Bilder werden auch vektorisiert, um die retrival qualität zu verbessern")
113
+ st.write("on premise anwendung mit mistral 7b oder vergleichbar")
114
+ st.write("Ecodomus API einbinden, um alle Dokumente einzubinden.")
115
+ st.write("")
116
+
117
+ if sst.login:
118
+ if st.toggle("RAG / classifier"):
119
+ #user_question = st.text_input("Ask a question about your documents:", key="user_query", on_change=handle_query)
120
+ st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)
121
+ #sst.openai = st.toggle(label="use openai?")
122
+ if sst.submitted_user_query:
123
+ if sst.vectorstore is not None:
124
+ handle_userinput(sst.submitted_user_query)
125
+ sst.submitted_user_query = False
126
+ else:
127
+ st.warning("no vectorstore loaded.")
128
+
129
+ with st.sidebar:
130
+ st.subheader("Your documents")
131
+ pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
132
+ if st.button("Process"):
133
+ with st.spinner("Processing"):
134
+ vec = ingest.get_text_chunks(ingest.get_pdf_text(pdf_docs))
135
+ st.warning("only text")
136
+ sst.vectorstore = vec
137
+ sst.conversation = vec
138
+ st.success("embedding complete")
139
+ st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
140
+ on_change=submit_user_safe)
141
+ st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
142
+ on_change=submit_user_load)
143
+ if st.toggle("reset vectorstore?"):
144
+ if st.button("Yes, reset"):
145
+ sst.vectorstore = None
146
+ st.warning("vectorstore reset complete")
147
+ else:
148
+ st.warning("unsaved embeddings will be lost.")
149
+ else:
150
+ file = st.file_uploader("upload file", accept_multiple_files=False)
151
+ vec_store = setup_db.load_vectorstore_from_excel("KBOB_Klassifizierung.xlsx")
152
+ if st.button("classify me!"):
153
+ with st.spinner("Classifying..."):
154
+ query_vecs = []
155
+ if file.type == "application/pdf":
156
+ one, two, three, four, five = st.columns(5)
157
+ text = ingest.get_pdf_text(file)
158
+ with one:
159
+ st.success("text")
160
+ # ONE OR MULTIPLE IS THE QUESTION
161
+ imgs = ingest.get_pdf_images(file.getvalue())
162
+ if type(imgs) != list:
163
+ imgs = [imgs]
164
+ for img in imgs:
165
+ text += my_new_openai.img_to_text(img_base64=my_new_openai.image_bytes_to_base64(img))
166
+ with two:
167
+ st.success("imgs")
168
+
169
+ tabs = ingest.get_pdf_tables(file.getvalue())
170
+
171
+ if type(tabs) != list:
172
+ tabs = [tabs]
173
+ for tab in tabs:
174
+ text += my_new_openai.table_to_text(table=tab)
175
+ with three:
176
+ st.success("tabs")
177
+ full_search = my_new_openai.vectorize_data(text)
178
+ detail_search = [my_new_openai.vectorize_data(_) for _ in ingest.get_text_chunks(text)]
179
+ with four:
180
+ st.success("vecs")
181
+ st.write(len(list(vec_store.keys())))
182
+ sorted_vec_table = my_2_sim_search.sim_search_fly(vec_table=vec_store, term=full_search)
183
+ st.success("sim search")
184
+ st.write(f"len of list of categories {len(list(sorted_vec_table.keys()))}")
185
+ for category in list(sorted_vec_table.keys())[:3]:
186
+ st.write(category)
187
+ for category in list(sorted_vec_table.keys())[-3:]:
188
+ st.write(category)
189
+ for vec in detail_search:
190
+ pass
191
+ else:
192
+ st.error()
193
+ else:
194
+ user_pw = st.text_input("ASK_ASH_PASSWORD: ", type="password")
195
+ if st.button("check"):
196
+ time.sleep(0.5)
197
+ if user_pw == ASK_ASH_PASSWORD:
198
+ sst.login = True
199
+ if "first_load" not in sst:
200
+ submit_user_load()
201
+ sst.first_load = True
202
+ st.rerun()
203
+
204
+
205
+ if __name__ == '__main__':
206
+ if True:
207
+ BASE_URL = "https://api.vectara.io/v1"
208
+ OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
209
+ OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
210
+ PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
211
+ HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
212
+ VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
213
+ VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]
214
+ headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}
215
+
216
+ sst = st.session_state
217
+ ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
218
+ main()
ask_app.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ complete, functional RAG App
3
+ stores vectors in session state, or locally.
4
+ add function to display retrieved documents
5
+ """
6
+
7
+ # import time
8
+ from datetime import datetime
9
+ # import openai
10
+ # import tiktoken
11
+ import streamlit as st
12
+ from PyPDF2 import PdfReader
13
+ from langchain.text_splitter import CharacterTextSplitter
14
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
15
+ from langchain.vectorstores import FAISS
16
+ from langchain.chat_models import ChatOpenAI
17
+ from langchain.memory import ConversationBufferMemory
18
+ from langchain.chains import ConversationalRetrievalChain
19
+ from html_templates import css, bot_template, user_template
20
+ from langchain.llms import HuggingFaceHub
21
+ import os
22
+ import numpy as np
23
+ import faiss_utils
24
+ from langchain_community.vectorstores import FAISS
25
+ from langchain.embeddings import OpenAIEmbeddings
26
+
27
+
28
def merge_faiss_indices(index1, index2):
    """
    Combine the vectors of two FAISS indices into one new index.

    Both indices must share the same concrete type and the same
    dimensionality.

    Args:
        index1 (faiss.Index): First index to merge.
        index2 (faiss.Index): Second index to merge.

    Returns:
        faiss.Index: A freshly built index holding the vectors of both inputs.

    Raises:
        ValueError: If the indices differ in type or dimensionality.
        TypeError: If the index type is not supported here.
    """
    # NOTE(review): this relies on FAISS.IndexFlatL2 / FAISS.rev_swig_ptr
    # being attributes of the imported FAISS class (langchain wrapper) —
    # confirm these resolve to the underlying faiss module at runtime.
    if type(index1) != type(index2):
        raise ValueError("Indices are of different types")
    if index1.d != index2.d:
        raise ValueError("Indices have different dimensionality")

    if isinstance(index1, FAISS.IndexFlatL2):
        # Plain flat index: pull the raw vectors out of both and re-add them.
        dim = index1.d
        vectors_a = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * dim)
        vectors_b = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * dim)
        merged = FAISS.IndexFlatL2(dim)
        merged.add(np.vstack((vectors_a, vectors_b)))
        return merged

    elif isinstance(index1, FAISS.IndexIVFFlat):
        # Inverted-file index: rebuild an identically configured index and
        # re-add the vectors from both inputs.
        dim = index1.d
        quantizer = FAISS.IndexFlatL2(dim)  # Re-create the appropriate quantizer
        merged = FAISS.IndexIVFFlat(quantizer, dim, index1.nlist, FAISS.METRIC_L2)
        # Assumes the inputs were already trained; otherwise `merged` must be
        # trained on a representative subset of vectors before adding.
        merged.add(FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * dim))
        merged.add(FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * dim))
        return merged

    else:
        raise TypeError("Index type not supported for merging in this function")
83
+
84
+
85
def get_pdf_text(pdf_docs):
    """
    Concatenate the text of every page of every uploaded PDF.

    Args:
        pdf_docs: Iterable of file-like objects (or paths) readable by PdfReader.

    Returns:
        str: The extracted text of all documents, concatenated in order.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may return None for pages without a text layer;
            # guard so concatenation never raises TypeError.
            text += page.extract_text() or ""
    return text
92
+
93
+
94
def get_text_chunks(text):
    """Split raw text into overlapping chunks for embedding.

    Produces newline-separated chunks of at most 1000 characters with a
    200-character overlap.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
103
+
104
+
105
def get_faiss_vectorstore(text_chunks):
    """Build a FAISS vector store from text chunks.

    Chooses OpenAI embeddings when the session flag `sst.openai` is set,
    otherwise a HuggingFace instructor model.
    """
    if sst.openai:
        embeddings = OpenAIEmbeddings()
    else:
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
112
+
113
+
114
def get_conversation_chain(vectorstore):
    """Create a conversational retrieval chain over the given vector store.

    The LLM backend is OpenAI chat when `sst.openai` is truthy, otherwise a
    hosted flan-t5-xxl model. Conversation history is kept in a buffer memory
    under the key 'chat_history'.
    """
    if sst.openai:
        llm = ChatOpenAI()
    else:
        llm = HuggingFaceHub(repo_id="google/flan-t5-xxl",
                             model_kwargs={"temperature": 0.5, "max_length": 512})
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
128
+
129
+
130
def handle_userinput(user_question):
    """Send a user question through the conversation chain and render the chat.

    Updates `sst.chat_history` from the chain response, then writes the
    transcript: even positions are user turns, odd positions are bot turns.
    """
    response = sst.conversation({'question': user_question})
    sst.chat_history = response['chat_history']

    for idx, message in enumerate(sst.chat_history):
        if idx % 2 == 0:
            # User turn.
            st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
            continue
        print(message)
        # Bot turn.
        st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
        # Show the source document when the message carries one.
        if hasattr(message, 'source') and message.source:
            st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
145
+
146
+
147
# Module-level configuration: all credentials are read from the environment.
# NOTE(review): the `if True:` wrapper has no effect; it only visually groups
# these assignments.
if True:
    BASE_URL = "https://api.vectara.io/v1"
    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
    OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
    PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
    HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
    VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
    VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]
    headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}
156
+
157
+
158
def main():
    """Streamlit entry point for the ASH RAG assistant.

    Initializes session-state defaults, gates the UI behind a password, and
    wires up query/save/load widgets plus the PDF-upload sidebar.
    """
    st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
    st.write(css, unsafe_allow_html=True)
    # Session-state defaults (only set on the first run of a session).
    if "conversation" not in sst:
        sst.conversation = None
    if "chat_history" not in sst:
        sst.chat_history = None
    if "page" not in sst:
        sst.page = "home"
    if "openai" not in sst:
        sst.openai = True
    if "login" not in sst:
        sst.login = False
    if 'submitted_user_query' not in sst:
        sst.submitted_user_query = ''
    if 'submitted_user_safe' not in sst:
        sst.submitted_user_safe = ''
    if 'submitted_user_load' not in sst:
        sst.submitted_user_load = ''

    def submit_user_query():
        # Widget callback: move the typed query into session state and clear
        # the input box.
        sst.submitted_user_query = sst.widget_user_query
        sst.widget_user_query = ''

    def submit_user_safe():
        # Widget callback: persist the current vector store to the given path.
        sst.submitted_user_safe = sst.widget_user_safe
        sst.widget_user_safe = ''
        if "vectorstore" in sst:
            # faiss_name = str(datetime.now().strftime("%Y%m%d%H%M%S")) + "faiss_index"
            faiss_utils.save_local(sst.vectorstore, path=sst.submitted_user_safe)
            st.sidebar.success("saved")
        else:
            st.sidebar.warning("No embeddings to save. Please process documents first.")

    def submit_user_load():
        # Widget callback: load a FAISS index from disk and either merge it
        # into the existing store or adopt it as the new store.
        sst.submitted_user_load = sst.widget_user_load
        sst.widget_user_load = ''
        if os.path.exists(sst.submitted_user_load):
            new_db = faiss_utils.load_vectorstore(f"{sst.submitted_user_load}/faiss_index.index")
            if "vectorstore" in sst:
                if new_db is not None:  # Check if this is working
                    sst.vectorstore.merge_from(new_db)
                    sst.conversation = get_conversation_chain(sst.vectorstore)
                    st.sidebar.success("faiss loaded")
            else:
                if new_db is not None:  # Check if this is working
                    sst.vectorstore = new_db
                    sst.conversation = get_conversation_chain(new_db)
                    st.sidebar.success("faiss loaded")
        else:
            st.sidebar.warning("Couldn't load/find embeddings")

    st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
    # Password gate: the rest of the UI renders only after a correct password.
    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:

        #user_question = st.text_input("Ask a question about your documents:", key="user_query", on_change=handle_query)
        st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)
        #sst.openai = st.toggle(label="use openai?")

        if sst.submitted_user_query:
            if "vectorstore" in sst:
                handle_userinput(sst.submitted_user_query)
            else:
                st.warning("no vectorstore loaded.")

        with st.sidebar:
            st.subheader("Your documents")
            pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
            if st.button("Process"):
                with st.spinner("Processing"):
                    # Build the store from all uploads and refresh the chain.
                    vec = get_faiss_vectorstore(get_text_chunks(get_pdf_text(pdf_docs)))
                    sst.vectorstore = vec
                    sst.conversation = get_conversation_chain(vec)
                    st.success("embedding complete")

            st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
                          on_change=submit_user_safe)

            st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
                          on_change=submit_user_load)
238
+
239
+
240
if __name__ == '__main__':
    # Alias Streamlit's session state and read the UI password from the
    # environment before starting the app.
    sst = st.session_state
    ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
    main()
classify_app.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ # import openai
4
+ from PyPDF2 import PdfReader
5
+ from openai import OpenAI
6
+ from langchain.chat_models import ChatOpenAI
7
+
8
+ ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
9
+
10
+
11
def gpt4_new(prompt_text):
    """Classify a document with GPT-4.

    Sends `prompt_text` together with a fixed German system prompt that
    instructs the model to answer with a single category (or
    '<no classification>') and returns the model's reply text.
    """
    api_client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
    system_message = {
        "role": "system",
        "content": "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten,"
                   "das Dokument in vorgegebene Kategorien klassifiziert."
                   "Du gibts möglichst kurze Antworten, am besten ein Wort"
                   "Du gibst keine Erklärungen oder Begründungen. "
                   "Du klassifizierst nur nach den vorgegebenen Kategorien."
                   "Wenn ein Dokument partout nicht klassifizierbar ist, "
                   "antwortest du mit '<no classification>'",
    }
    completion = api_client.chat.completions.create(
        model="gpt-4",
        messages=[system_message, {"role": "user", "content": prompt_text}],
    )
    return completion.choices[0].message.content
25
+
26
+
27
+ # Define a function to ask a question to GPT-4
28
# Define a function to ask a question to GPT-4
def ask_gpt4(question):
    """
    Ask a single question of the OpenAI chat model and return the answer text.

    Args:
        question (str): The prompt to send.

    Returns:
        str: The model's reply, or the exception message on failure.
    """
    print(question)
    try:
        # BUG FIX: the previous version instantiated ChatOpenAI() but never
        # sent the question, then subscripted the model object — which always
        # raised, so the function only ever returned the error string.
        # Invoke the model with the question instead.
        llm = ChatOpenAI()
        return llm.predict(question)
    except Exception as e:
        # Keep the original contract: surface the failure as text.
        return str(e)
38
+
39
+
40
def process_prompts_and_save(my_prompts):
    """Run every prompt through GPT-4 and dump prompt/response pairs to disk.

    Results (or error messages) are collected in order and written to
    'gpt4_responses.txt'.
    """
    responses = []
    for prompt in my_prompts:
        try:
            # ADD LOGIC TO READ FILE AND CLASSIFY
            answer = ask_gpt4(prompt)
            entry = f"{prompt}\n\n{answer}\n\n\n\n"
            print(entry)
            responses.append(entry)
        except Exception as e:
            # Record the failure alongside its prompt instead of aborting.
            responses.append(f"{prompt}\n\nError:{str(e)}\n\n\n\n")

    with open('gpt4_responses.txt', 'w', encoding='utf-8') as file:
        file.writelines(responses)
60
+
61
+
62
def get_pdfs_text(pdf_docs):
    """
    Concatenate the text of every page of every PDF in `pdf_docs`.

    Args:
        pdf_docs: Iterable of file-like objects (or paths) readable by PdfReader.

    Returns:
        str: The extracted text of all documents, concatenated in order.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may return None for pages without a text layer;
            # guard so concatenation never raises TypeError.
            text += page.extract_text() or ""
    return text
69
+
70
+
71
def get_pdf_text(pdf_document):
    """
    Extract the concatenated page text of a single PDF.

    Args:
        pdf_document: File-like object or path readable by PdfReader.

    Returns:
        str: All page text concatenated in order.
    """
    text = ""
    pdf_reader = PdfReader(pdf_document)
    for page in pdf_reader.pages:
        # extract_text() may return None for pages without a text layer.
        text += page.extract_text() or ""
    return text
77
+
78
+
79
def json_open(filename):
    """Return the raw text content of `filename`.

    Note: despite the name, the content is returned as-is and is NOT parsed
    as JSON.
    """
    with open(filename, "r") as fh:
        return fh.read()
83
+
84
+
85
def main():
    """Streamlit entry point of the document classifier.

    Shows an optional README, gates the uploader behind a password, and for
    each uploaded PDF asks GPT-4 for discipline, document type and floor,
    then offers the collected metadata as a download.
    """
    st.title("Doc Classifier")
    l, r = st.columns(2)
    if st.toggle("show README"):
        st.subheader("Funktion: ")
        st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente")
        st.write("des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren.")
        st.write("lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren.")
        st.write("Feedback und Bugs gerne an elia.waefler@insel.ch")
        st.write("Vielen Dank.")
        st.write("")
        with l:
            st.subheader("Limitationen: ")
            st.write("bisher nur PDFs")
            st.write("nur Disziplin, Doc typ. und Geschoss")
            st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
            st.write("")
        with r:
            st.subheader("geplante Erweiterungen:")
            st.write("Text Beschreibung wird von AI hinzugefügt")
            st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
            st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")

    # Password gate: the classifier UI renders only after a correct password.
    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
        uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)
        #print(uploaded_file)
        #print(uploaded_file.name)

        if st.button("classify KBOB!"):
            if uploaded_files is not None:
                with st.container():
                    # col1, col2, col3, col4, col5 = st.columns(5)
                    col1, col2, col3 = st.columns(3)
                    all_metadata = []
                    # Column headers for the three classification dimensions.
                    with col1:
                        st.write("Disziplin")
                        st.write(f"")
                    with col2:
                        st.write("Dokumententyp")
                        st.write(f"")
                    with col3:
                        st.write("Geschoss")
                        st.write(f"")

                    for file in uploaded_files:
                        # One metadata row per file: [name, answers...].
                        metadata = [file.name]
                        with col1:
                            with st.spinner("GPT4 at work"):
                                pdf_text = str(get_pdf_text(file))
                                prompt_1 = auftrag_0 + auftrag_1_disziplin + str(Baubranchen_Disziplinen) + pdf_text
                                answer_1 = gpt4_new(prompt_1)
                                print(prompt_1)
                                metadata.append(answer_1)
                                st.write(answer_1)

                        with col2:
                            with st.spinner("GPT4 at work"):
                                prompt_2 = auftrag_0 + auftrag_1_type + str(Dokumententypen) + pdf_text
                                answer_2 = gpt4_new(prompt_2)
                                print(prompt_2)
                                metadata.append(answer_2)

                                st.write(answer_2)

                        with col3:
                            with st.spinner("GPT4 at work"):
                                prompt_3 = auftrag_0 + auftrag_1_ge + str(ASH_Geschosse) + pdf_text
                                answer_3 = gpt4_new(prompt_3)
                                print(prompt_3)
                                # NOTE(review): this appends answer_2 a second
                                # time; the floor answer (answer_3) is shown
                                # but never stored — confirm intended.
                                metadata.append(answer_2)

                                st.write(answer_3)

                        all_metadata.append(metadata)

                    # Persist all rows as semicolon-separated lines.
                    metadata_filename = "ai_generated_metadata.txt"
                    with open(metadata_filename, 'w', encoding='utf-8') as f:
                        for line in all_metadata:
                            f.writelines("\n")
                            for item in line:
                                f.writelines(item)
                                f.writelines(";")

                            f.writelines("\n")

                    st.success("classified, saved")
                    st.download_button(f"Download Metadata", json_open(metadata_filename), file_name=metadata_filename)
            else:
                st.warning("no file")
174
+
175
+
176
if __name__ == "__main__":
    #prompts = ["classify the document, tell me the ", "hello"]
    #process_prompts_and_save(prompts)
    # Prompt fragments that main() concatenates with a category list and the
    # extracted PDF text.
    auftrag_0 = "Klassifiziere dieses Dokument nach "
    auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
    auftrag_1_type = "diesen 'Dokumententypen': "
    auftrag_1_ge = "diesen 'Geschossen': "
    # Closed category lists used for classification.
    Baubranchen_Disziplinen = ['A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
                               'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
                               'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
                               'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
                               'Z-Lichtplanung']
    # NOTE(review): auftrag_2 is defined but not referenced in this file's
    # visible code — confirm before removing.
    auftrag_2 = "gib nur den am besten passendsten Eintrag zurück. " \
                "Keine weiteren Ausführungen oder Erklärungen. " \
                "Antworte am besten in einem Wort. " \
                "Hier der Dokumenteninhalt: "
    Dokumententypen = ['Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
                       'Prozessdokumentation', 'Fachdokumentation', 'Anlagedokumentation']
    ASH_Geschosse = ['U4', 'U3', 'U2', 'U1',
                     'A', 'B', 'C', 'D', 'E', 'F', 'G']
    #print(str(Baubranchen_Disziplinen))
    main()
ingest.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyPDF2 import PdfReader
2
+ from langchain.text_splitter import CharacterTextSplitter
3
+ import tabula
4
+ import io
5
+ import fitz # PyMuPDF
6
+ import pdfplumber
7
+
8
+
9
def get_pdf_tables(pdf_bytes):
    """
    Extract all tables from an in-memory PDF.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[pandas.DataFrame]: One DataFrame per extracted table; the first
        row of each raw table is used as the column header.
    """
    import pandas as pd

    raw_tables = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page in pdf.pages:
            # A page may contain several tables.
            raw_tables.extend(page.extract_tables())

    # First row becomes the header; empty tables are skipped.
    return [pd.DataFrame(tbl[1:], columns=tbl[0]) for tbl in raw_tables if tbl]
32
+
33
+
34
def get_pdf_images(pdf_bytes):
    """
    Collect a page screenshot plus every embedded image from a PDF.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[bytes]: For each page, first a PNG screenshot of the whole page,
        followed by the raw bytes of each embedded image.
    """
    collected = []
    doc = fitz.open("pdf", io.BytesIO(pdf_bytes).read())

    for page in doc:
        # Full-page render first ...
        collected.append(page.get_pixmap().tobytes("png"))
        # ... then every image object embedded on the page.
        for image_info in page.get_images(full=True):
            xref = image_info[0]
            collected.append(doc.extract_image(xref)["image"])

    doc.close()
    return collected
63
+
64
+
65
def get_pdf_old_tables(pdf_bytes):
    """
    Extract tables from an in-memory PDF using Tabula.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[pandas.DataFrame]: One DataFrame per table found on any page.
    """
    stream = io.BytesIO(pdf_bytes)
    return tabula.read_pdf(stream, pages='all', multiple_tables=True)
78
+
79
+
80
def get_pdf_text(pdf_docs):
    """
    Extract the concatenated text of one PDF or a list of PDFs.

    Args:
        pdf_docs: A single file-like object/path or a list of them.

    Returns:
        str: All page text concatenated in order.
    """
    def _read_one(doc):
        # extract_text() may return None for pages without a text layer;
        # guard so concatenation never raises TypeError.
        return "".join(page.extract_text() or "" for page in PdfReader(doc).pages)

    # isinstance is the idiomatic type check (was: type(pdf_docs) == list).
    if isinstance(pdf_docs, list):
        return "".join(_read_one(pdf) for pdf in pdf_docs)
    return _read_one(pdf_docs)
92
+
93
+
94
def get_text_chunks(text):
    """Split `text` into 1000-character chunks (200-character overlap),
    breaking on newlines."""
    return CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    ).split_text(text)
103
+
104
+
105
def extract_images_from_pdf_path(pdf_path):
    """
    Extract every embedded image of a PDF file as PIL images.

    Args:
        pdf_path (str): Path to the PDF on disk.

    Returns:
        List[PIL.Image.Image]: One image object per embedded image.
    """
    # Hoisted out of the loop: the previous version re-imported PIL and io
    # for every single image.
    from PIL import Image

    doc = fitz.open(pdf_path)
    images = []
    for page_index in range(len(doc)):
        for img in doc.get_page_images(page_index):
            xref = img[0]
            img_bytes = doc.extract_image(xref)['image']
            images.append(Image.open(io.BytesIO(img_bytes)))

    return images
122
+
123
+
124
def get_tables_from_pdf_path(pdf_path):
    """Return all tables of the PDF at `pdf_path` as pandas DataFrames."""
    # Tabula parses every page and yields one DataFrame per detected table.
    return tabula.read_pdf(pdf_path, pages='all')
128
+
129
+
130
+ print(get_pdf_text("ISB-020-U3-W-E-01-B15100-005-000.pdf"))
my_1_reader.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MUSS AUFGERÄUMT WERDEN
2
+
3
+ import json
4
+ import os
5
+ import subprocess
6
+ import PyPDF2
7
+ import csv
8
+ import fitz # PyMuPDF
9
+
10
+
11
def extract_text_from_pdf(pdf_path):
    """
    Extracts all text from a PDF file.

    :param pdf_path: Path to the PDF file.
    :return: Extracted text as a string.
    """
    doc = fitz.open(pdf_path)
    pages_text = []
    for page_number in range(len(doc)):
        # Load each page in order and collect its text.
        pages_text.append(doc.load_page(page_number).get_text())
    doc.close()
    return ''.join(pages_text)
36
+
37
+
38
+ def read_pdfs_from_folder(folder_path):
39
+ """
40
+ Reads all PDF files in the specified folder using PdfReader and extracts their text.
41
+
42
+ Parameters:
43
+ - folder_path: The path to the folder containing PDF files.
44
+
45
+ Returns:
46
+ - A dictionary with file names as keys and their extracted text as values.
47
+ """
48
+ pdf_texts = {}
49
+ for filename in os.listdir(folder_path):
50
+ if filename.endswith('.pdf'):
51
+ file_path = os.path.join(folder_path, filename)
52
+ with open(file_path, 'rb') as pdf_file:
53
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
54
+ text = ''
55
+ for page in pdf_reader.pages:
56
+ try:
57
+ text += page.extract_text()
58
+ except UnicodeDecodeError as e:
59
+ print(e)
60
+ for c in text:
61
+ if c in ["ä", "Ä"]:
62
+ text = text[:text.index(c)] + "ae" + text[text.index(c)+1:]
63
+ if c in ["ö", "Ö"]:
64
+ text = text[:text.index(c)] + "oe" + text[text.index(c)+1:]
65
+ if c in ["ü", "Ü"]:
66
+ text = text[:text.index(c)] + "ue" + text[text.index(c)+1:]
67
+ if c in [",", ";", "\\", '"']:
68
+ text = text[:text.index(c)] + "_" + text[text.index(c)+1:]
69
+ if c in ["/n", "\n"]:
70
+ text = text[:text.index(c)] + "<newline>" + text[text.index(c) + 1:]
71
+ pdf_texts[filename] = text
72
+ return pdf_texts
73
+
74
+
75
def read_csv_lines_as_strings(filename):
    """
    Open a CSV file and return each parsed line as a string in a list.

    Parameters:
    - filename: The path to the CSV file.

    Returns:
    - A list of strings, one per CSV row (values re-joined with commas).
    """
    rows_as_strings = []
    with open(filename, newline='') as csvfile:
        try:
            for row in csv.reader(csvfile):
                # Re-join the parsed fields into a comma-separated line.
                rows_as_strings.append(','.join(row))
        except UnicodeDecodeError as e:
            print(e)
    return rows_as_strings
96
+
97
+
98
# Function to load data from JSON files
def load_data(filename):
    """Parse `filename` as JSON; return an empty dict when decoding fails."""
    with open(filename, 'r') as file:
        try:
            return json.load(file)
        except UnicodeDecodeError as err:
            print(err)
            return {}
106
+
107
+
108
def find_and_open_file(filename, start_directory):
    """
    Search `start_directory` recursively for `filename`.

    Prints and returns the full path of the first match, or prints a
    not-found message and returns None.
    """
    for root, dirs, files in os.walk(start_directory):
        if filename not in files:
            continue
        filepath = os.path.join(root, filename)
        print(f"File found: {filepath}")
        return filepath
    print(f"File (unknown) not found.")
    return None
120
+
121
+
122
def open_file(filepath):
    """
    Open `filepath` with the platform's default application.

    Uses `open` via subprocess on POSIX systems and os.startfile on Windows;
    prints a diagnostic when the file or platform support is missing.
    """
    if not os.path.exists(filepath):
        print(f"File does not exist: {filepath}")
        return
    if os.name == 'posix':  # Linux, macOS, etc.
        subprocess.call(('open', filepath))
    elif os.name == 'nt':  # Windows
        os.startfile(filepath)
    else:
        print(f"Cannot open file on this operating system: {filepath}")
135
+
136
+
137
def list_folders_files_recursive(path, depth=0):
    """
    Print all folders and files below `path`, indenting by recursion depth.

    Parameters:
    - path: The directory path to list contents from.
    - depth: Current recursion depth (controls indentation).

    Returns:
    - None
    """
    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return

    indent = ' ' * depth
    file_names = []
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        if os.path.isdir(entry_path):
            # Folders are printed immediately and descended into.
            print(f"{indent}Folder: {entry}")
            list_folders_files_recursive(entry_path, depth + 1)
        elif os.path.isfile(entry_path):
            file_names.append(entry)

    # Files of this level are printed after all its folders.
    for name in file_names:
        print(f"{indent}File: {name}")
169
+
170
+
171
def list_folders_files(path):
    """
    List the immediate folders and files of `path`.

    Parameters:
    - path: The directory path to list contents from.

    Returns:
    - A tuple of two lists: (folders, files). Both are empty when `path`
      is not a valid directory.
    """
    folders, files = [], []
    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return folders, files

    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        if os.path.isdir(entry_path):
            folders.append(entry)
        elif os.path.isfile(entry_path):
            files.append(entry)

    return folders, files
198
+
199
+
200
if __name__ == "__main__":
    # This module is a helper library; running it directly only prints a hint.
    print("here are all functions that read files")
my_2_sim_search.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import my_new_openai
2
+ import my_1_writer
3
+ import json
4
+ import numpy as np
5
+
6
+
7
+ # sim search with dot_product and lin_distance
8
+ # the newly vectorized TERM will be added to the database
9
+ # database = .json file
10
def sim_search_load_db(database, term, add_to_db=True, debug=False):
    """
    Similarity-search `term` against a JSON vector table on disk.

    The score per entry is dot(term_vec, entry_vec) * euclidean_distance;
    the result dict is ordered by descending score. When `add_to_db` is set
    and the term is new, its vector is appended to the database file.

    Args:
        database (str): Path to the JSON file mapping keys to vectors.
        term (str | list): Search text (will be embedded) or a ready vector.
        add_to_db (bool): Persist the new term's vector when True.
        debug (bool): Print per-entry diagnostics.

    Returns:
        dict | None: Keys ordered by descending score, or None on bad input.
    """
    if type(term) == str:
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif type(term) == list:
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    with open(database, "r") as f:
        table = json.load(f)
    sim_search_dict = {}
    for key in table.keys():
        vector2 = table[key]
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        dp = np.dot(vector1, vector2)
        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            print(f"the distance is{distance}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp * distance

    # BUG FIX: `dict(sorted(...), reversed=True)` previously sorted ascending
    # and injected a bogus key "reversed": True into the result; pass
    # reverse=True to sorted() so the biggest similarity comes first.
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))

    if debug:
        # BUG FIX: dicts are not sliceable; iterate the first five items.
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    if add_to_db:
        if term in table.keys():
            print("the search term is in the database!")
        # add the newly vectorized term to the words, if not already in the vector table
        else:
            if database != "session/my_words_vec_table.json":
                database = "session/my_vecs.json"
            # table = load_df(database)  # ??
            table[str(term)] = vector1
            my_1_writer.safe_my_dict_as_json(database, table)
    # first_key, first_value = list(sortedTable.items())[0]
    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table
60
+
61
+
62
def dot_p_to_1(database, vector1=0, analysis_filename=0):
    """Dot-product every vector in `database` against a reference vector.

    vector1=0 selects a constant reference vector; vector1=1 selects the
    first entry of the table. Results are written to `analysis_filename`
    as JSON.
    """
    with open(database, "r") as f:
        table = json.load(f)

    if vector1 == 0:
        # Constant reference vector (ada-002 dimensionality).
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    dot_product_to1 = {key: np.dot(vector1, vec) for key, vec in table.items()}
    my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
    print("dot p to 1 saved")
77
+
78
+
79
def lin_dist(database, vector1=0, analysis_filename=0):
    """Euclidean distance of every vector in `database` to a reference vector.

    vector1=0 selects a constant reference vector; vector1=1 selects the
    first table entry. Results are saved to `analysis_filename` as JSON.
    """
    with open(database, "r") as f:
        table = json.load(f)

    if vector1 == 0:
        # Constant reference vector (ada-002 dimensionality).
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    reference = np.array(vector1)
    lin_dist_to_1 = {key: np.linalg.norm(reference - np.array(vec)) for key, vec in table.items()}

    my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
    print("lin dist to 1 saved")
94
+
95
+
96
def manhattan_dist(database, vector1=0, analysis_filename=0):
    """Manhattan (L1) distance of every vector in `database` to a reference.

    vector1=0 selects a constant reference vector; vector1=1 selects the
    first table entry. Results are saved to `analysis_filename` as JSON.
    """
    with open(database, "r") as f:
        table = json.load(f)
    manhattan_dist_to_1 = {}

    if vector1 == 0:
        # Constant reference vector (ada-002 dimensionality).
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    for key in table.keys():
        # BUG FIX: Manhattan distance is the sum of ABSOLUTE differences;
        # the previous sum of signed differences let terms cancel out.
        manhattan_dist_to_1[key] = float(np.sum(np.abs(np.array(vector1) - np.array(table[key]))))

    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
    print("manhattan dist to 1 saved")
111
+
112
+
113
+ #vec_table
114
def sim_search_fly(vec_table, term, debug=False):
    """
    Similarity-search `term` against an in-memory vector table.

    Entries whose vector has equal leading component pairs are treated as
    placeholders and given the sentinel score 200 instead of a dot product.
    The result dict is ordered by descending score.

    Args:
        vec_table (dict): Mapping of keys to embedding vectors.
        term (str | list): Search text (will be embedded) or a ready vector.
        debug (bool): Print per-entry diagnostics.

    Returns:
        dict | None: Keys ordered by descending score, or None on bad input.
    """
    if debug:
        print(type(vec_table))
        print(type(term))
        print(type(vec_table[list(vec_table.keys())[0]]))
        print("vec table:")
        print(vec_table[list(vec_table.keys())[5]][:4])
        print("search term")
        print(term[:4])
    if type(term) == str:
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif type(term) == list:
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return

    sim_search_dict = {}
    for key in vec_table.keys():
        vector2 = vec_table[key]
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        # NOTE(review): vectors with these equal component pairs are scored
        # with the sentinel 200 — confirm the sentinel's intended rank.
        if vector2[0] == vector2[1] and vector2[3] == vector2[4] and vector2[5] == vector2[6]:
            dp = 200
        else:
            dp = np.dot(vector1, vector2)
        if debug:
            print(f"the dp is {dp}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp

    # BUG FIX: `dict(sorted(...), reversed=True)` previously sorted ascending
    # and injected a bogus "reversed": True entry; pass reverse=True to
    # sorted() so the biggest similarity comes first.
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))

    if debug:
        # BUG FIX: dicts are not sliceable; iterate the first five items.
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")

    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table
164
+
my_new_openai.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from openai import OpenAI
3
+ import requests
4
+ import base64
5
+ from pydub import AudioSegment
6
+ from moviepy.editor import VideoFileClip
7
+
8
+ client = OpenAI()
9
+
10
+
11
def image_bytes_to_base64(image_bytes):
    """
    Encode raw image bytes as a Base64 string.

    Args:
        image_bytes (bytes): Byte content of the image.

    Returns:
        str: The Base64 encoded representation of the image.
    """
    encoded = base64.b64encode(image_bytes)
    return encoded.decode('utf-8')
22
+
23
+
24
def image_to_base64(image_path):
    """Read the image file at *image_path* and return it as a Base64 string."""
    with open(image_path, "rb") as image_file:
        raw = image_file.read()
    return str(base64.b64encode(raw).decode('utf-8'))
27
+
28
+
29
def gpt4_new(prompt_text):
    """Send *prompt_text* to GPT-4 and return the reply text.

    A fixed German system prompt frames the model as a
    document-classification machine.
    """
    conversation = [
        {"role": "system",
         "content": "Du bist eine Maschine, die Dokumente klassifiziert."},
        {"role": "user", "content": prompt_text},
    ]
    gpt_response = client.chat.completions.create(model="gpt-4", messages=conversation)
    return gpt_response.choices[0].message.content
36
+
37
+
38
def vectorize_data(data_input):
    """Embed *data_input* with OpenAI's text-embedding-ada-002 model.

    Args:
        data_input (list | str): A list of items to embed individually,
            or a single string.

    Returns:
        dict | list | None: For a list input, a dict mapping str(item) to
        that item's embedding vector; for a string input, the embedding
        vector itself; None for any other type.
    """
    if isinstance(data_input, list):
        # BUGFIX: the original passed the *whole list* as input and always
        # read .data[0], so every key received the identical embedding of
        # the first item. Embed each item on its own instead.
        my_dict = {}
        for item in data_input:
            my_dict[str(item)] = client.embeddings.create(
                input=str(item),
                model="text-embedding-ada-002").data[0].embedding
        return my_dict

    elif isinstance(data_input, str):
        # Single string: return just the vector.
        return client.embeddings.create(input=data_input, model="text-embedding-ada-002").data[0].embedding

    else:
        # Unsupported type: keep the original lenient behavior (log, return None).
        print("none")
55
+
56
+
57
def img_create(prompt="a nice house on the beach", download_path=""):
    """Generate an image with DALL-E 3 and optionally save it locally.

    Args:
        prompt (str): Text description of the desired image.
        download_path (str): If non-empty, download the PNG to this path.

    Returns:
        str: The (temporary) URL of the generated image; to keep the image,
        it must be downloaded.
    """
    generation = client.images.generate(model="dall-e-3", prompt=prompt, size="1024x1024")
    my_url = generation.data[0].url
    if download_path:
        my_image = requests.get(my_url)
        if my_image.status_code == 200:
            with open(download_path, 'wb') as f:
                f.write(my_image.content)
        else:
            print("Failed to retrieve image")
    return my_url
68
+
69
+
70
def img_to_text(img_url="", img_base64="", prompt="What’s in this image?", print_out=True):
    """Describe an image with GPT-4 Turbo vision.

    Provide exactly one of *img_url* (public URL, sent via the OpenAI SDK)
    or *img_base64* (Base64 JPEG data, sent via a direct REST call that
    reads OPENAI_API_KEY from the environment).

    Args:
        img_url (str): Public URL of the image.
        img_base64 (str): Base64-encoded image data.
        prompt (str): Question to ask about the image.
        print_out (bool): If True, also print the model's answer.

    Returns:
        str: The model's description of the image.

    Raises:
        ValueError: If neither img_url nor img_base64 is given.
    """
    if img_url:
        img_desc_response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": img_url,
                            },
                        },
                    ],
                }
            ],
            max_tokens=500,
        )
        if print_out:
            print(img_desc_response.choices[0].message.content)
        return img_desc_response.choices[0].message.content
    elif img_base64:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
        }
        payload = {
            "model": "gpt-4-turbo",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{img_base64}"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": 300
        }
        img_desc_response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        if print_out:
            print(img_desc_response.json()["choices"][0]["message"]["content"])
        return img_desc_response.json()["choices"][0]["message"]["content"]
    else:
        # BUGFIX: the original *returned* the ValueError class instead of
        # raising, silently handing callers a type object on bad input.
        raise ValueError("either img_url or img_base64 must be provided")
125
+
126
+
127
def encode_image_to_base64(image_path):
    """Read the file at *image_path* and return its Base64 string (UTF-8).

    NOTE(review): functionally a duplicate of image_to_base64 above;
    consider consolidating the two helpers.
    """
    with open(image_path, "rb") as image_file:
        contents = image_file.read()
    encoded_string = base64.b64encode(contents).decode('utf-8')
    return encoded_string
131
+
132
+
133
def mp4_to_mp3(video_file_path, audio_file_path):
    """Extract the audio track of an MP4 video into an MP3 file (via moviepy).

    Args:
        video_file_path (str): Path of the source video.
        audio_file_path (str): Path the MP3 is written to.
    """
    clip = VideoFileClip(video_file_path)
    clip.audio.write_audiofile(audio_file_path)
    # Release the file handles moviepy holds on the video.
    clip.close()
    print(f"Converted {video_file_path} to {audio_file_path}")
144
+
145
+
146
def mp4_audio_to_mp3(mp4_audio_path, mp3_output_path):
    """Re-encode an MP4 audio file as MP3 (via pydub, which shells out to ffmpeg).

    Args:
        mp4_audio_path (str): Path of the source MP4 audio file.
        mp3_output_path (str): Path the MP3 is written to.
    """
    track = AudioSegment.from_file(mp4_audio_path, format="mp4")
    track.export(mp3_output_path, format="mp3")
    print(f"Converted {mp4_audio_path} to {mp3_output_path}")
154
+
155
+
156
def table_to_text(table=None, prompt="describe this table in plain text. "
                                     "be as precise as possible. spare no detail. "
                                     "what is in this table?", print_out=True):
    """Ask GPT-4 for a plain-text description of *table*.

    Args:
        table: Any value whose string form represents the table contents.
        prompt (str): Instruction prepended to the table text.
        print_out (bool): If True, also print the model's answer.

    Returns:
        str: The model's description of the table.

    Raises:
        ValueError: If table is None.
    """
    if table is None:
        # BUGFIX: the original returned the ValueError *class* instead of raising.
        raise ValueError("table must not be None")
    response = gpt4_new(f"{prompt} TABLE: {table}")
    if print_out:
        print(response)
    return response
166
+
167
+
168
def danja():
    """Ad-hoc helper: transcribe a local audio file with Whisper and print the text.

    NOTE(review): contains a hard-coded local Windows path; the MP4-to-MP3
    conversion steps are left commented out from earlier experiments.
    """
    #mp4_file = "C:\\Users\\eliaw\\Downloads\\WhatsApp Audio 2024-05-10 at 22.17.12.mp4"

    #mp3_file = "output_audio.mp3"
    mp3_file = "C:\\Users\\eliaw\\Downloads\\WhatsApp Audio 2024-05-10 at 22.17.12.mp3"

    # mp4_audio_to_mp3(mp4_file, mp3_file)

    # Usage example
    # mp4_to_mp3(mp4_file, mp3_file)

    # BUGFIX: the original opened the file without ever closing it (resource
    # leak); a with-block guarantees the handle is released.
    with open(mp3_file, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file
        )
    print(transcription.text)
185
+
186
+
187
if __name__ == "__main__":
    # Manual smoke tests for the helpers in this module.
    #print("here are all functions that directly call openai.")
    #img_create("a skier in the swiss alps", download_path="skier.png")
    #img_to_text(img_base64=encode_image_to_base64("skier.png"))
    #print(image_to_base64("skier.png"))
    #print(vectorize_data("test string"))

    # BUGFIX: gpt4_new() requires a prompt argument; calling it with none
    # raised TypeError before any request was made.
    print(gpt4_new("Sag hallo."))
195
+
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit~=1.33.0
2
+ bcrypt~=4.1.2
3
+ psycopg2-binary~=2.9.9
4
+ openai~=1.23.2
5
+ pypdf2~=3.0.1
6
+ langchain~=0.1.16
7
+ tiktoken~=0.6.0
8
+ numpy~=1.26.4
9
+ requests~=2.31.0
10
+ faiss-cpu
+ openpyxl~=3.1
+ pydub~=0.25
+ moviepy~=1.0
setup_db.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import openpyxl
3
+ import my_new_openai
4
+
5
+
6
def update_excel_with_sums(filename):
    """Embed "B: C" of every row and write the vector into columns D onward.

    For each row whose columns B and C are both non-empty, the string
    "{B}: {C}" is vectorized via OpenAI and the embedding's components are
    written one per cell starting at column D. The workbook is saved in place.

    Args:
        filename (str): Path to the .xlsx workbook.
    """
    # Load the workbook and select the active worksheet
    workbook = openpyxl.load_workbook(filename)
    sheet = workbook.active

    # Iterate through each row in the sheet (columns B and C only)
    for row in sheet.iter_rows(min_row=1, min_col=2, max_col=3):
        b_cell, c_cell = row  # B and C are columns 2 and 3 respectively
        vector = my_new_openai.vectorize_data(f"{b_cell.value}: {c_cell.value}") if b_cell.value and c_cell.value else 0
        if vector != 0:
            # BUGFIX: the original used vector.index(val), which is O(n) per
            # element and returns the FIRST occurrence — duplicate values were
            # all written to the same column. enumerate() yields the true
            # position of every component.
            for i, val in enumerate(vector):
                sheet.cell(row=b_cell.row, column=4 + i).value = val

    # Save the workbook
    workbook.save(filename)
    # BUGFIX: the filename was not interpolated in the original message.
    print(f"Updated the file '{filename}' with vectors in column D.")
22
+
23
+
24
def load_vectorstore_from_excel(filename, min_row=3, max_row=634, dim=1536):
    """Load a vector store from an Excel sheet into a dictionary.

    Each data row holds a label in column A and a *dim*-dimensional embedding
    starting in column D (one float per cell).

    Args:
        filename (str): Path to the .xlsx workbook (active sheet is read).
        min_row (int): First data row, 1-based, inclusive. Defaults to 3,
            matching the previously hard-coded layout.
        max_row (int): Row AFTER the last data row (exclusive, as in range()).
            Defaults to 634.
        dim (int): Number of embedding columns to read. Defaults to 1536
            (the text-embedding-ada-002 dimensionality).

    Returns:
        dict: Maps str(label in column A) -> list of `dim` cell values.
    """
    # Load the workbook and select the active worksheet
    workbook = openpyxl.load_workbook(filename)
    sheet = workbook.active

    # Collect one vector per row
    vec_store = {}
    for row in range(min_row, max_row):
        vec = [sheet.cell(row=row, column=4 + col).value for col in range(dim)]
        vec_store[str(sheet.cell(row=row, column=1).value)] = vec
    return vec_store
39
+
40
+
41
if __name__ == '__main__':
    # Manual check: time the vector-store load and show first/second/last
    # component of every entry.
    #update_excel_with_sums("KBOB_Klassifizierung.xlsx")
    start = time.time()

    vec_store = load_vectorstore_from_excel("KBOB_Klassifizierung.xlsx")

    print(time.time() - start)
    for label in vec_store.keys():
        print(f"{label}: {vec_store[label][0]}, {vec_store[label][1]}, .... {vec_store[label][-1]}")
50
+