Space status: Runtime error
Upload 14 files
- .gitattributes +37 -35
- .gitignore +160 -0
- KBOB_Klassifizierung.xlsx +3 -0
- LICENSE +201 -0
- README.md +10 -13
- app.py +218 -0
- ask_app.py +243 -0
- classify_app.py +197 -0
- ingest.py +130 -0
- my_1_reader.py +201 -0
- my_2_sim_search.py +164 -0
- my_new_openai.py +195 -0
- requirements.txt +10 -0
- setup_db.py +50 -0
.gitattributes
CHANGED
@@ -1,35 +1,37 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+U3_alle/faiss_index.index/index.faiss filter=lfs diff=lfs merge=lfs -text
+KBOB_Klassifizierung.xlsx filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
KBOB_Klassifizierung.xlsx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd8797003fcbcfe6e52b7b8f2fcf56c8639e8807099f6af786bfddb5713a9761
+size 13370614
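
These three lines are a Git LFS pointer, not the spreadsheet itself: the 13 MB file lives in LFS storage, and the repository tracks only its sha256 and size. A minimal sketch of how the pointer's oid relates to the file on disk (standard library only; the filename is the one from this commit):

import hashlib

def lfs_oid(path):
    # Git LFS pointers record the sha256 of the blob's raw bytes.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

# For the real 13,370,614-byte file this should print dd8797003f...
print(lfs_oid("KBOB_Klassifizierung.xlsx"))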
LICENSE
ADDED
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
README.md
CHANGED
@@ -1,13 +1,10 @@
----
-title:
-emoji:
-colorFrom:
-colorTo: indigo
-sdk: streamlit
-sdk_version: 1.
-app_file: app.py
-pinned: false
-
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+---
+title: ask-ASH
+emoji: 🏥
+colorFrom: yellow
+colorTo: indigo
+sdk: streamlit
+sdk_version: 1.33.0
+app_file: app.py
+pinned: false
+---
app.py
ADDED
@@ -0,0 +1,218 @@
+"""
+testing my own vectors
+"""
+import ingest
+import my_2_sim_search
+import my_new_openai
+import setup_db
+import time
+import streamlit as st
+import os
+import my_vectors
+
+
+def merge_indices(index1, index2):
+    """
+    Merge two indices into a new index, assuming both are of the same type and dimensionality.
+    """
+    pass
+
+
+def handle_userinput(user_question):
+    pass
+
+
+def save_uploaded_file(uploaded_file):
+    try:
+        # Create a static folder if it doesn't exist
+        if not os.path.exists('static'):
+            os.makedirs('static')
+
+        # Write the uploaded file to a new file in the static directory
+        with open(os.path.join('static', uploaded_file.name), "wb") as f:
+            f.write(uploaded_file.getbuffer())
+        return True
+    except Exception as e:
+        print(e)
+        return False
+
+
+def main():
+    st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
+    if True:
+        if "conversation" not in sst:
+            sst.conversation = None
+        if "chat_history" not in sst:
+            sst.chat_history = None
+        if "page" not in sst:
+            sst.page = "home"
+        if "openai" not in sst:
+            sst.openai = True
+        if "login" not in sst:
+            sst.login = False
+        if 'submitted_user_query' not in sst:
+            sst.submitted_user_query = ''
+        if 'submitted_user_safe' not in sst:
+            sst.submitted_user_safe = ''
+        if 'submitted_user_load' not in sst:
+            sst.submitted_user_load = ''
+        if 'widget_user_load' not in sst:
+            sst.widget_user_load = 'U3_alle'  # Init the vectorstore
+        if 'vectorstore' not in sst:
+            sst.vectorstore = None
+
+    def submit_user_query():
+        sst.submitted_user_query = sst.widget_user_query
+        sst.widget_user_query = ''
+
+    def submit_user_safe():
+        sst.submitted_user_safe = sst.widget_user_safe
+        sst.widget_user_safe = ''
+        if sst.vectorstore is not None:
+            my_vectors.save_local(sst.vectorstore, path=sst.submitted_user_safe)
+            st.sidebar.success("saved")
+        else:
+            st.sidebar.warning("No embeddings to save. Please process documents first.")
+
+    def submit_user_load():
+        sst.submitted_user_load = sst.widget_user_load
+        sst.widget_user_load = ''
+        if os.path.exists(sst.submitted_user_load):
+            new_db = my_vectors.load_local(f"{sst.submitted_user_load}/faiss_index.index")
+            if sst.vectorstore is not None:
+                if new_db is not None:  # Check if this is working
+                    st.sidebar.success("Vectors loaded")
+            else:
+                if new_db is not None:  # Check if this is working
+                    sst.vectorstore = new_db
+                    st.sidebar.success("Vectors loaded")
+        else:
+            st.sidebar.warning("Couldn't load/find embeddings")
+
+    st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
+    if st.toggle("show README"):
+
+        st.subheader("Funktion: ")
+        st.write("dieses proof-of-concept von Elia Wäfler demonstriert das Potential von RAG (Retrival Augmented Generation) für BIM2FM Dokumentenablagen am Beispiel Dokumente U3 ASH (Anna Seiler Haus, Inselspital Bern). chatte mit den Dokumenten, oder lade selber ein oder mehrere PDF-Dokumente hoch, um RAG auszuprobieren. die vektoren werden lokal oder im st.session_state gespeichert. Feedback und Bugs gerne an elia.waefler@insel.ch")
+        st.write("Vielen Dank.")
+        st.write("")
+
+        st.subheader("Licence and credits")
+        st.write("THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.")
+        st.write("special thanks to OpenAI, STREAMLIT, HUGGINGFACE, LANGCHAIN and alejandro-ao")
+        l, r = st.columns(2)
+        with l:
+            st.subheader("Limitationen: ")
+            st.write("bisher nur Text aus PDFs")
+            st.write("macht Fehler, kann falsche Informationen geben")
+            st.write("prompts werden bisher nicht geprüft")
+            st.write("")
+        with r:
+            st.subheader("geplante Erweiterungen:")
+            st.write("Tabellen, Bilder werden auch vektorisiert, um die retrival qualität zu verbessern")
+            st.write("on premise anwendung mit mistral 7b oder vergleichbar")
+            st.write("Ecodomus API einbinden, um alle Dokumente einzubinden.")
+            st.write("")
+
+    if sst.login:
+        if st.toggle("RAG / classifier"):
+            #user_question = st.text_input("Ask a question about your documents:", key="user_query", on_change=handle_query)
+            st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)
+            #sst.openai = st.toggle(label="use openai?")
+            if sst.submitted_user_query:
+                if sst.vectorstore is not None:
+                    handle_userinput(sst.submitted_user_query)
+                    sst.submitted_user_query = False
+                else:
+                    st.warning("no vectorstore loaded.")
+
+            with st.sidebar:
+                st.subheader("Your documents")
+                pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
+                if st.button("Process"):
+                    with st.spinner("Processing"):
+                        vec = ingest.get_text_chunks(ingest.get_pdf_text(pdf_docs))
+                        st.warning("only text")
+                        sst.vectorstore = vec
+                        sst.conversation = vec
+                        st.success("embedding complete")
+                st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
+                              on_change=submit_user_safe)
+                st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
+                              on_change=submit_user_load)
+                if st.toggle("reset vectorstore?"):
+                    if st.button("Yes, reset"):
+                        sst.vectorstore = None
+                        st.warning("vectorstore reset complete")
+                    else:
+                        st.warning("unsaved embeddings will be lost.")
+        else:
+            file = st.file_uploader("upload file", accept_multiple_files=False)
+            vec_store = setup_db.load_vectorstore_from_excel("KBOB_Klassifizierung.xlsx")
+            if st.button("classify me!"):
+                with st.spinner("Classifying..."):
+                    query_vecs = []
+                    if file.type == "application/pdf":
+                        one, two, three, four, five = st.columns(5)
+                        text = ingest.get_pdf_text(file)
+                        with one:
+                            st.success("text")
+                        # ONE OR MULTIPLE IS THE QUESTION
+                        imgs = ingest.get_pdf_images(file.getvalue())
+                        if type(imgs) != list:
+                            imgs = [imgs]
+                        for img in imgs:
+                            text += my_new_openai.img_to_text(img_base64=my_new_openai.image_bytes_to_base64(img))
+                        with two:
+                            st.success("imgs")
+
+                        tabs = ingest.get_pdf_tables(file.getvalue())
+
+                        if type(tabs) != list:
+                            tabs = [tabs]
+                        for tab in tabs:
+                            text += my_new_openai.table_to_text(table=tab)
+                        with three:
+                            st.success("tabs")
+                        full_search = my_new_openai.vectorize_data(text)
+                        detail_search = [my_new_openai.vectorize_data(_) for _ in ingest.get_text_chunks(text)]
+                        with four:
+                            st.success("vecs")
+                        st.write(len(list(vec_store.keys())))
+                        sorted_vec_table = my_2_sim_search.sim_search_fly(vec_table=vec_store, term=full_search)
+                        st.success("sim search")
+                        st.write(f"len of list of categories {len(list(sorted_vec_table.keys()))}")
+                        for category in list(sorted_vec_table.keys())[:3]:
+                            st.write(category)
+                        for category in list(sorted_vec_table.keys())[-3:]:
+                            st.write(category)
+                        for vec in detail_search:
+                            pass
+                    else:
+                        st.error()
+    else:
+        user_pw = st.text_input("ASK_ASH_PASSWORD: ", type="password")
+        if st.button("check"):
+            time.sleep(0.5)
+            if user_pw == ASK_ASH_PASSWORD:
+                sst.login = True
+                if "first_load" not in sst:
+                    submit_user_load()
+                    sst.first_load = True
+                st.rerun()
+
+
+if __name__ == '__main__':
+    if True:
+        BASE_URL = "https://api.vectara.io/v1"
+        OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
+        OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
+        PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
+        HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
+        VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
+        VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]
+        headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}
+
+    sst = st.session_state
+    ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
+    main()
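
Note that app.py imports a my_vectors module that is not among the 14 uploaded files, which by itself would explain the Space's "Runtime error" status; handle_userinput is also still a stub. A minimal sketch of the save_local/load_local helpers the code expects, assuming the vectorstore is a raw faiss.Index stored under the faiss_index.index/index.faiss layout referenced in .gitattributes (the real module is not shown, so the names and behavior here are assumptions):

import os
import faiss


def save_local(index, path):
    # Mirror the layout app.py expects: <path>/faiss_index.index/index.faiss
    target = os.path.join(path, "faiss_index.index")
    os.makedirs(target, exist_ok=True)
    faiss.write_index(index, os.path.join(target, "index.faiss"))


def load_local(path):
    # app.py passes "<folder>/faiss_index.index"; the index file sits inside it.
    index_file = os.path.join(path, "index.faiss")
    return faiss.read_index(index_file) if os.path.exists(index_file) else None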
ask_app.py
ADDED
@@ -0,0 +1,243 @@
+"""
+complete, functional RAG App
+stores vectors in session state, or locally.
+add function to display retrieved documents
+"""
+
+# import time
+from datetime import datetime
+# import openai
+# import tiktoken
+import streamlit as st
+from PyPDF2 import PdfReader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.chat_models import ChatOpenAI
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationalRetrievalChain
+from html_templates import css, bot_template, user_template
+from langchain.llms import HuggingFaceHub
+import os
+import numpy as np
+import faiss_utils
+from langchain_community.vectorstores import FAISS
+from langchain.embeddings import OpenAIEmbeddings
+
+
+def merge_faiss_indices(index1, index2):
+    """
+    Merge two FAISS indices into a new index, assuming both are of the same type and dimensionality.
+
+    Args:
+        index1 (faiss.Index): The first FAISS index.
+        index2 (faiss.Index): The second FAISS index.
+
+    Returns:
+        faiss.Index: A new FAISS index containing all vectors from index1 and index2.
+    """
+
+    # Check if both indices are the same type
+    if type(index1) != type(index2):
+        raise ValueError("Indices are of different types")
+
+    # Check dimensionality
+    if index1.d != index2.d:
+        raise ValueError("Indices have different dimensionality")
+
+    # Determine type of indices
+    if isinstance(index1, FAISS.IndexFlatL2):
+        # Handle simple flat indices
+        d = index1.d
+        # Extract vectors from both indices
+        xb1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
+        xb2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)
+
+        # Combine vectors
+        xb_combined = np.vstack((xb1, xb2))
+
+        # Create a new index and add combined vectors
+        new_index = FAISS.IndexFlatL2(d)
+        new_index.add(xb_combined)
+        return new_index
+
+    elif isinstance(index1, FAISS.IndexIVFFlat):
+        # Handle quantized indices (IndexIVFFlat)
+        d = index1.d
+        nlist = index1.nlist
+        quantizer = FAISS.IndexFlatL2(d)  # Re-create the appropriate quantizer
+
+        # Create a new index with the same configuration
+        new_index = FAISS.IndexIVFFlat(quantizer, d, nlist, FAISS.METRIC_L2)
+
+        # If the indices are already trained, you can directly add the vectors
+        # Otherwise, you may need to train new_index using a representative subset of vectors
+        vecs1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
+        vecs2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)
+        new_index.add(vecs1)
+        new_index.add(vecs2)
+        return new_index
+
+    else:
+        raise TypeError("Index type not supported for merging in this function")
+
+
+def get_pdf_text(pdf_docs):
+    text = ""
+    for pdf in pdf_docs:
+        pdf_reader = PdfReader(pdf)
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+    return text
+
+
+def get_text_chunks(text):
+    text_splitter = CharacterTextSplitter(
+        separator="\n",
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len
+    )
+    chunks = text_splitter.split_text(text)
+    return chunks
+
+
+def get_faiss_vectorstore(text_chunks):
+    if sst.openai:
+        my_embeddings = OpenAIEmbeddings()
+    else:
+        my_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=my_embeddings)
+    return vectorstore
+
+
+def get_conversation_chain(vectorstore):
+    if sst.openai:
+        llm = ChatOpenAI()
+    else:
+        llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})
+
+    memory = ConversationBufferMemory(
+        memory_key='chat_history', return_messages=True)
+    conversation_chain = ConversationalRetrievalChain.from_llm(
+        llm=llm,
+        retriever=vectorstore.as_retriever(),
+        memory=memory
+    )
+    return conversation_chain
+
+
+def handle_userinput(user_question):
+    response = sst.conversation({'question': user_question})
+    sst.chat_history = response['chat_history']
+
+    for i, message in enumerate(sst.chat_history):
+        # Display user message
+        if i % 2 == 0:
+            st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
+        else:
+            print(message)
+            # Display AI response
+            st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
+            # Display source document information if available in the message
+            if hasattr(message, 'source') and message.source:
+                st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
+
+
+if True:
+    BASE_URL = "https://api.vectara.io/v1"
+    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
+    OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
+    PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
+    HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
+    VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
+    VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]
+    headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}
+
+
+def main():
+    st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
+    st.write(css, unsafe_allow_html=True)
+    if "conversation" not in sst:
+        sst.conversation = None
+    if "chat_history" not in sst:
+        sst.chat_history = None
+    if "page" not in sst:
+        sst.page = "home"
+    if "openai" not in sst:
+        sst.openai = True
+    if "login" not in sst:
+        sst.login = False
+    if 'submitted_user_query' not in sst:
+        sst.submitted_user_query = ''
+    if 'submitted_user_safe' not in sst:
+        sst.submitted_user_safe = ''
+    if 'submitted_user_load' not in sst:
+        sst.submitted_user_load = ''
+
+    def submit_user_query():
+        sst.submitted_user_query = sst.widget_user_query
+        sst.widget_user_query = ''
+
+    def submit_user_safe():
+        sst.submitted_user_safe = sst.widget_user_safe
+        sst.widget_user_safe = ''
+        if "vectorstore" in sst:
+            # faiss_name = str(datetime.now().strftime("%Y%m%d%H%M%S")) + "faiss_index"
+            faiss_utils.save_local(sst.vectorstore, path=sst.submitted_user_safe)
+            st.sidebar.success("saved")
+        else:
+            st.sidebar.warning("No embeddings to save. Please process documents first.")
+
+    def submit_user_load():
+        sst.submitted_user_load = sst.widget_user_load
+        sst.widget_user_load = ''
+        if os.path.exists(sst.submitted_user_load):
+            new_db = faiss_utils.load_vectorstore(f"{sst.submitted_user_load}/faiss_index.index")
+            if "vectorstore" in sst:
+                if new_db is not None:  # Check if this is working
+                    sst.vectorstore.merge_from(new_db)
+                    sst.conversation = get_conversation_chain(sst.vectorstore)
+                    st.sidebar.success("faiss loaded")
+            else:
+                if new_db is not None:  # Check if this is working
+                    sst.vectorstore = new_db
+                    sst.conversation = get_conversation_chain(new_db)
+                    st.sidebar.success("faiss loaded")
+        else:
+            st.sidebar.warning("Couldn't load/find embeddings")
+
+    st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
+    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
+
+        #user_question = st.text_input("Ask a question about your documents:", key="user_query", on_change=handle_query)
+        st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)
+        #sst.openai = st.toggle(label="use openai?")
+
+        if sst.submitted_user_query:
+            if "vectorstore" in sst:
+                handle_userinput(sst.submitted_user_query)
+            else:
+                st.warning("no vectorstore loaded.")
+
+        with st.sidebar:
+            st.subheader("Your documents")
+            pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
+            if st.button("Process"):
+                with st.spinner("Processing"):
+                    vec = get_faiss_vectorstore(get_text_chunks(get_pdf_text(pdf_docs)))
+                    sst.vectorstore = vec
+                    sst.conversation = get_conversation_chain(vec)
+                    st.success("embedding complete")
+
+            st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
+                          on_change=submit_user_safe)
+
+            st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
+                          on_change=submit_user_load)
+
+
+if __name__ == '__main__':
+    sst = st.session_state
+    ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
+    main()
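
One caveat on merge_faiss_indices: it calls IndexFlatL2, IndexIVFFlat, rev_swig_ptr and METRIC_L2 as attributes of LangChain's FAISS vectorstore class, which exposes none of them; they live on the faiss module itself, and faiss indices have no .xb attribute in the Python API. (The file also imports FAISS twice, with the langchain_community import shadowing the earlier one.) A corrected sketch for the flat-index case, using only calls that exist in faiss:

import faiss
import numpy as np


def merge_flat_indices(index1, index2):
    # Merge two faiss.IndexFlatL2 objects of equal dimensionality.
    if index1.d != index2.d:
        raise ValueError("Indices have different dimensionality")
    # reconstruct_n returns the stored vectors as an (ntotal, d) float32 array.
    xb1 = index1.reconstruct_n(0, index1.ntotal)
    xb2 = index2.reconstruct_n(0, index2.ntotal)
    merged = faiss.IndexFlatL2(index1.d)
    merged.add(np.vstack((xb1, xb2)))
    return merged

For LangChain vectorstores the simpler route is the merge_from method that submit_user_load already uses.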
classify_app.py
ADDED
@@ -0,0 +1,197 @@
+import streamlit as st
+import os
+# import openai
+from PyPDF2 import PdfReader
+from openai import OpenAI
+from langchain.chat_models import ChatOpenAI
+
+ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
+
+
+def gpt4_new(prompt_text):
+    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
+    response = client.chat.completions.create(
+        model="gpt-4",
+        messages=[{"role": "system",
+                   "content": "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten,"
+                              "das Dokument in vorgegebene Kategorien klassifiziert."
+                              "Du gibts möglichst kurze Antworten, am besten ein Wort"
+                              "Du gibst keine Erklärungen oder Begründungen. "
+                              "Du klassifizierst nur nach den vorgegebenen Kategorien."
+                              "Wenn ein Dokument partout nicht klassifizierbar ist, "
+                              "antwortest du mit '<no classification>'"},
+                  {"role": "user", "content": prompt_text}])
+    return response.choices[0].message.content
+
+
+# Define a function to ask a question to GPT-4
+def ask_gpt4(question):
+    print(question)  # we don't have to submit the question?
+    try:
+        # Use the chat function to send a message and get a response
+        response = ChatOpenAI()
+        # Extract the response text
+        return response["choices"][0]["message"]["content"]
+    except Exception as e:
+        # Handle exceptions that may occur during the API call
+        return str(e)
+
+
+def process_prompts_and_save(my_prompts):
+    # Ensure the responses list is empty initially
+    responses = []
+
+    # Loop through each prompt in the list
+    for prompt in my_prompts:
+        try:
+            # ADD LOGIC TO READ FILE AND CLASSIFY
+            # Generate response for each prompt and append to the list
+            response = ask_gpt4(prompt)
+            sol = f"{prompt}\n\n{response}\n\n\n\n"
+            print(sol)
+            responses.append(sol)
+        except Exception as e:
+            # In case of an error, log the error with the prompt
+            responses.append(f"{prompt}\n\nError:{str(e)}\n\n\n\n")
+
+    # Writing all responses to a text file
+    with open('gpt4_responses.txt', 'w', encoding='utf-8') as file:
+        file.writelines(responses)
+
+
+def get_pdfs_text(pdf_docs):
+    text = ""
+    for pdf in pdf_docs:
+        pdf_reader = PdfReader(pdf)
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+    return text
+
+
+def get_pdf_text(pdf_document):
+    text = ""
+    pdf_reader = PdfReader(pdf_document)
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+    return text
+
+
+def json_open(filename):
+    with open(filename, "r") as f:
+        mydata = f.read()
+    return mydata
+
+
+def main():
+    st.title("Doc Classifier")
+    l, r = st.columns(2)
+    if st.toggle("show README"):
+        st.subheader("Funktion: ")
+        st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente")
+        st.write("des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren.")
+        st.write("lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren.")
+        st.write("Feedback und Bugs gerne an elia.waefler@insel.ch")
+        st.write("Vielen Dank.")
+        st.write("")
+        with l:
+            st.subheader("Limitationen: ")
+            st.write("bisher nur PDFs")
+            st.write("nur Disziplin, Doc typ. und Geschoss")
+            st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
+            st.write("")
+        with r:
+            st.subheader("geplante Erweiterungen:")
+            st.write("Text Beschreibung wird von AI hinzugefügt")
+            st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
+            st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")
+
+    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
+        uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)
+        #print(uploaded_file)
+        #print(uploaded_file.name)
+
+        if st.button("classify KBOB!"):
+            if uploaded_files is not None:
+                with st.container():
+                    # col1, col2, col3, col4, col5 = st.columns(5)
+                    col1, col2, col3 = st.columns(3)
+                    all_metadata = []
+                    with col1:
+                        st.write("Disziplin")
+                        st.write(f"")
+                    with col2:
+                        st.write("Dokumententyp")
+                        st.write(f"")
+                    with col3:
+                        st.write("Geschoss")
+                        st.write(f"")
+
+                    for file in uploaded_files:
+                        metadata = [file.name]
+                        with col1:
+                            with st.spinner("GPT4 at work"):
+                                pdf_text = str(get_pdf_text(file))
+                                prompt_1 = auftrag_0 + auftrag_1_disziplin + str(Baubranchen_Disziplinen) + pdf_text
+                                answer_1 = gpt4_new(prompt_1)
+                                print(prompt_1)
+                                metadata.append(answer_1)
+                                st.write(answer_1)
+
+                        with col2:
+                            with st.spinner("GPT4 at work"):
+                                prompt_2 = auftrag_0 + auftrag_1_type + str(Dokumententypen) + pdf_text
+                                answer_2 = gpt4_new(prompt_2)
+                                print(prompt_2)
+                                metadata.append(answer_2)
+
+                                st.write(answer_2)
+
+                        with col3:
+                            with st.spinner("GPT4 at work"):
+                                prompt_3 = auftrag_0 + auftrag_1_ge + str(ASH_Geschosse) + pdf_text
+                                answer_3 = gpt4_new(prompt_3)
+                                print(prompt_3)
+                                metadata.append(answer_2)
+
+                                st.write(answer_3)
+
+                        all_metadata.append(metadata)
+
+                    metadata_filename = "ai_generated_metadata.txt"
+                    with open(metadata_filename, 'w', encoding='utf-8') as f:
+                        for line in all_metadata:
+                            f.writelines("\n")
+                            for item in line:
+                                f.writelines(item)
+                                f.writelines(";")
+
+                        f.writelines("\n")
+
+                    st.success("classified, saved")
+                    st.download_button(f"Download Metadata", json_open(metadata_filename), file_name=metadata_filename)
+            else:
+                st.warning("no file")
+
+
+if __name__ == "__main__":
+    #prompts = ["classify the document, tell me the ", "hello"]
+    #process_prompts_and_save(prompts)
+    auftrag_0 = "Klassifiziere dieses Dokument nach "
+    auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
+    auftrag_1_type = "diesen 'Dokumententypen': "
+    auftrag_1_ge = "diesen 'Geschossen': "
+    Baubranchen_Disziplinen = ['A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
+                               'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
+                               'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
+                               'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
+                               'Z-Lichtplanung']
+    auftrag_2 = "gib nur den am besten passendsten Eintrag zurück. " \
+                "Keine weiteren Ausführungen oder Erklärungen. " \
+                "Antworte am besten in einem Wort. " \
+                "Hier der Dokumenteninhalt: "
+    Dokumententypen = ['Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
+                       'Prozessdokumentation', 'Fachdokumentation', 'Anlagedokumentation']
+    ASH_Geschosse = ['U4', 'U3', 'U2', 'U1',
+                     'A', 'B', 'C', 'D', 'E', 'F', 'G']
+    #print(str(Baubranchen_Disziplinen))
+    main()
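
Two things in classify_app.py look like bugs rather than intent: ask_gpt4 instantiates ChatOpenAI() and then subscripts it like a response dict, so every call lands in the except branch and returns the error string; and the col3 block appends answer_2 to metadata instead of answer_3, so the Geschoss column silently repeats the Dokumententyp. The hand-rolled semicolon file would also be simpler with the csv module; a sketch (the header names are inferred from the three column labels, not taken from the commit):

import csv


def save_metadata(all_metadata, filename="ai_generated_metadata.csv"):
    # Each row is [file.name, answer_1, answer_2, answer_3].
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=";")
        writer.writerow(["Dateiname", "Disziplin", "Dokumententyp", "Geschoss"])
        writer.writerows(all_metadata)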
ingest.py
ADDED
@@ -0,0 +1,130 @@
+from PyPDF2 import PdfReader
+from langchain.text_splitter import CharacterTextSplitter
+import tabula
+import io
+import fitz  # PyMuPDF
+import pdfplumber
+import pandas as pd
+
+
+def get_pdf_tables(pdf_bytes):
+    """
+    Extracts tables from a PDF file loaded directly from bytes.
+
+    Args:
+        pdf_bytes (bytes): The byte content of the PDF file.
+
+    Returns:
+        List[pandas.DataFrame]: A list of DataFrames, each representing a table extracted from the PDF.
+    """
+    tables = []
+    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
+        for page in pdf.pages:
+            # Extract tables from the current page
+            page_tables = page.extract_tables()
+            for table in page_tables:
+                # Each table is a list of rows; collect them all
+                tables.append(table)
+
+    # Convert the lists of lists to DataFrames, using the first row as the header
+    # (pandas is imported at module level; the original imported it here)
+    dataframes = [pd.DataFrame(table[1:], columns=table[0]) for table in tables if table]
+    return dataframes
+
+
+def get_pdf_images(pdf_bytes):
+    """
+    Extracts images and captures screenshots of each page from a given PDF's bytes.
+
+    Args:
+        pdf_bytes (bytes): The byte content of the PDF file.
+
+    Returns:
+        List[bytes]: A list of image bytes extracted from the PDF, including screenshots of each page.
+    """
+    images = []
+    pdf_stream = io.BytesIO(pdf_bytes)
+    doc = fitz.open("pdf", pdf_stream.read())
+
+    for page_num, page in enumerate(doc):
+        # Take a screenshot of the current page
+        pix = page.get_pixmap()  # captures the page as an image
+        img_bytes = pix.tobytes("png")  # save the pixmap as PNG bytes
+        images.append(img_bytes)  # append the screenshot to the list of images
+
+        # Extract embedded images
+        for img_index, img in enumerate(page.get_images(full=True)):
+            xref = img[0]
+            base_image = doc.extract_image(xref)
+            image_bytes = base_image["image"]
+            images.append(image_bytes)
+
+    doc.close()
+    return images
+
+
+def get_pdf_old_tables(pdf_bytes):
+    """
+    Extracts tables from a given PDF's bytes using Tabula.
+
+    Args:
+        pdf_bytes (bytes): The byte content of the PDF file.
+
+    Returns:
+        List[pandas.DataFrame]: A list of DataFrames, each representing a table extracted from the PDF.
+    """
+    pdf_stream = io.BytesIO(pdf_bytes)
+    # Read PDF into a list of DataFrames
+    tables = tabula.read_pdf(pdf_stream, pages='all', multiple_tables=True)
+    return tables
+
+
+def get_pdf_text(pdf_docs):
+    text = ""
+    if isinstance(pdf_docs, list):
+        for pdf in pdf_docs:
+            pdf_reader = PdfReader(pdf)
+            for page in pdf_reader.pages:
+                text += page.extract_text()
+    else:
+        pdf_reader = PdfReader(pdf_docs)
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+    return text
+
+
+def get_text_chunks(text):
+    text_splitter = CharacterTextSplitter(
+        separator="\n",
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len
+    )
+    chunks = text_splitter.split_text(text)
+    return chunks
+
+
+def extract_images_from_pdf_path(pdf_path):
+    from PIL import Image  # local import, as in the original
+
+    doc = fitz.open(pdf_path)
+    images = []
+    for i in range(len(doc)):
+        for img in doc.get_page_images(i):
+            xref = img[0]
+            img_data = doc.extract_image(xref)
+            img_bytes = img_data['image']
+
+            # open the image with PIL
+            image = Image.open(io.BytesIO(img_bytes))
+            images.append(image)
+
+    return images
+
+
+def get_tables_from_pdf_path(pdf_path):
+    # read_pdf saves each PDF table into a pandas DataFrame
+    tables = tabula.read_pdf(pdf_path, pages='all')
+    return tables
+
+
+if __name__ == "__main__":
+    # guarded so that importing this module does not parse a PDF as a side effect
+    print(get_pdf_text("ISB-020-U3-W-E-01-B15100-005-000.pdf"))
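A minimal usage sketch of these helpers (hypothetical; "example.pdf" is a placeholder path, and the names come straight from ingest.py above):

    from ingest import get_pdf_tables, get_pdf_images, get_pdf_text, get_text_chunks

    with open("example.pdf", "rb") as f:   # placeholder path, any local PDF works
        pdf_bytes = f.read()

    tables = get_pdf_tables(pdf_bytes)     # list of pandas DataFrames
    images = get_pdf_images(pdf_bytes)     # page screenshots + embedded images as bytes
    chunks = get_text_chunks(get_pdf_text("example.pdf"))  # ~1000-char chunks, 200 overlap
    print(len(tables), len(images), len(chunks))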
my_1_reader.py
ADDED
@@ -0,0 +1,201 @@
+# NEEDS TO BE CLEANED UP
+
+import json
+import os
+import subprocess
+import PyPDF2
+import csv
+import fitz  # PyMuPDF
+
+
+def extract_text_from_pdf(pdf_path):
+    """
+    Extracts all text from a PDF file.
+
+    :param pdf_path: Path to the PDF file.
+    :return: Extracted text as a string.
+    """
+    # Open the PDF file
+    doc = fitz.open(pdf_path)
+
+    # Initialize an empty string to hold the text
+    text = ''
+
+    # Iterate through each page in the PDF
+    for page_num in range(len(doc)):
+        # Get a page
+        page = doc.load_page(page_num)
+
+        # Extract text from the page and add it to the result
+        text += page.get_text()
+
+    # Close the document
+    doc.close()
+
+    return text
+
+
+def read_pdfs_from_folder(folder_path):
+    """
+    Reads all PDF files in the specified folder using PdfReader and extracts their text.
+
+    Parameters:
+    - folder_path: The path to the folder containing PDF files.
+
+    Returns:
+    - A dictionary with file names as keys and their extracted text as values.
+    """
+    pdf_texts = {}
+    for filename in os.listdir(folder_path):
+        if filename.endswith('.pdf'):
+            file_path = os.path.join(folder_path, filename)
+            with open(file_path, 'rb') as pdf_file:
+                pdf_reader = PyPDF2.PdfReader(pdf_file)
+                text = ''
+                for page in pdf_reader.pages:
+                    try:
+                        text += page.extract_text()
+                    except UnicodeDecodeError as e:
+                        print(e)
+                # Transliterate umlauts and escape CSV-critical characters.
+                # (str.replace fixes the original index-based loop, which only
+                # ever touched the first occurrence of each character.)
+                replacements = {
+                    "ä": "ae", "Ä": "Ae",
+                    "ö": "oe", "Ö": "Oe",
+                    "ü": "ue", "Ü": "Ue",
+                    ",": "_", ";": "_", "\\": "_", '"': "_",
+                    "\n": "<newline>",
+                }
+                for old, new in replacements.items():
+                    text = text.replace(old, new)
+                pdf_texts[filename] = text
+    return pdf_texts
+
+
+def read_csv_lines_as_strings(filename):
+    """
+    Opens a CSV file and returns each line as a string in a list.
+
+    Parameters:
+    - filename: The path to the CSV file.
+
+    Returns:
+    - A list of strings, each representing a line from the CSV file.
+    """
+    lines_as_strings = []
+    with open(filename, newline='') as csvfile:
+        try:
+            reader = csv.reader(csvfile)
+            for row in reader:
+                # Convert the row (a list of values) back into a comma-separated string
+                line_as_string = ','.join(row)
+                lines_as_strings.append(line_as_string)
+        except UnicodeDecodeError as e:
+            print(e)
+    return lines_as_strings
+
+
+# Function to load data from JSON files
+def load_data(filename):
+    with open(filename, 'r') as file:
+        try:
+            return json.load(file)
+        except UnicodeDecodeError as err:
+            print(err)
+            return {}
+
+
+def find_and_open_file(filename, start_directory):
+    """
+    Attempts to open a file with the given filename starting from the specified directory.
+    If the file is not found, searches recursively in all subfolders. Works across macOS, Linux, and Windows.
+    """
+    for root, dirs, files in os.walk(start_directory):
+        if filename in files:
+            filepath = os.path.join(root, filename)
+            print(f"File found: {filepath}")
+            return filepath
+    print(f"File {filename} not found.")
+    return None
+
+
+def open_file(filepath):
+    """
+    Opens the file with the default application, based on the operating system.
+    """
+    if os.path.exists(filepath):
+        if os.name == 'posix':  # macOS ('open'); plain Linux would need 'xdg-open' instead
+            subprocess.call(('open', filepath))
+        elif os.name == 'nt':  # Windows
+            os.startfile(filepath)
+        else:
+            print(f"Cannot open file on this operating system: {filepath}")
+    else:
+        print(f"File does not exist: {filepath}")
+
+
+def list_folders_files_recursive(path, depth=0):
+    """
+    Recursively lists all folders and files within the specified path, including subfolders.
+
+    Parameters:
+    - path: The directory path to list contents from.
+    - depth: The current depth of recursion (used for indentation in print statements).
+
+    Returns:
+    - None
+    """
+    # Ensure the provided path is a directory
+    if not os.path.isdir(path):
+        print(f"The provided path '{path}' is not a valid directory.")
+        return
+
+    indent = ' ' * depth  # Indentation based on recursion depth
+    folders, files = [], []
+
+    # List all entries in the directory
+    for entry in os.listdir(path):
+        full_path = os.path.join(path, entry)
+        if os.path.isdir(full_path):
+            folders.append(entry)
+            print(f"{indent}Folder: {entry}")
+            # Recursively list subfolders and files
+            list_folders_files_recursive(full_path, depth + 1)
+        elif os.path.isfile(full_path):
+            files.append(entry)
+
+    for f in files:
+        print(f"{indent}File: {f}")
+
+
+def list_folders_files(path):
+    """
+    Lists all folders and files within the specified path.
+
+    Parameters:
+    - path: The directory path to list contents from.
+
+    Returns:
+    - A tuple of two lists: (folders, files).
+    """
+    folders = []
+    files = []
+
+    # Ensure the provided path is a directory
+    if not os.path.isdir(path):
+        print(f"The provided path '{path}' is not a valid directory.")
+        return folders, files
+
+    # List all entries in the directory
+    for entry in os.listdir(path):
+        full_path = os.path.join(path, entry)
+        if os.path.isdir(full_path):
+            folders.append(entry)
+        elif os.path.isfile(full_path):
+            files.append(entry)
+
+    return folders, files
+
+
+if __name__ == "__main__":
+    print("here are all functions that read files")
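A short usage sketch for the reader helpers (hypothetical; "docs" and "report.pdf" are placeholder names):

    from my_1_reader import read_pdfs_from_folder, find_and_open_file, open_file

    texts = read_pdfs_from_folder("docs")          # {filename: sanitized text}
    for name, text in texts.items():
        print(name, text[:80])

    path = find_and_open_file("report.pdf", ".")   # recursive search from the cwd
    if path:
        open_file(path)                            # opens with the OS default app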
my_2_sim_search.py
ADDED
@@ -0,0 +1,164 @@
+import my_new_openai
+import my_1_writer
+import json
+import numpy as np
+
+
+# similarity search with dot product and linear (Euclidean) distance;
+# the newly vectorized TERM is added to the database (a .json file)
+def sim_search_load_db(database, term, add_to_db=True, debug=False):
+    if isinstance(term, str):
+        print("str")
+        vector1 = my_new_openai.vectorize_data(term)
+    elif isinstance(term, list):
+        print("list")
+        vector1 = term
+    else:
+        print("invalid search_term/search_vector format")
+        return
+    with open(database, "r") as f:
+        table = json.load(f)
+    sim_search_dict = {}
+    for key in table.keys():
+        vector2 = table[key]
+        if debug:
+            print("")
+            print(f"{vector1}")
+            print(f"{vector2}")
+            print(f"doing dot product for {key} and {term}")
+        dp = np.dot(vector1, vector2)
+        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
+        if debug:
+            print(f"the dp is {dp}")
+            print(f"the distance is {distance}")
+            print("")
+        # combined score: dot product scaled by Euclidean distance
+        sim_search_dict[key] = dp * distance
+
+    # sort with the biggest similarity first
+    # (the original passed reversed=True to dict(), which silently added a
+    # bogus "reversed" key instead of reversing the sort order)
+    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
+
+    if debug:
+        for key, value in list(sorted_table.items())[:5]:
+            print(f"{key}: {value}")
+    if add_to_db:
+        if term in table.keys():
+            print("the search term is in the database!")
+        # add the newly vectorized term to the words, if not already in the vector table
+        else:
+            if database != "session/my_words_vec_table.json":
+                database = "session/my_vecs.json"
+            # table = load_df(database) # ??
+            table[str(term)] = vector1
+            my_1_writer.safe_my_dict_as_json(database, table)
+    # first_key, first_value = list(sorted_table.items())[0]
+    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
+    return sorted_table
+
+
+def dot_p_to_1(database, vector1=0, analysis_filename=0):
+    with open(database, "r") as f:
+        table = json.load(f)
+    dot_product_to1 = {}
+
+    if vector1 == 0:
+        vector1 = [0.025515518153991442 for _ in range(1536)]
+    elif vector1 == 1:
+        vector1 = table[str(list(table.keys())[0])]
+
+    for key in table.keys():
+        dot_product_to1[key] = np.dot(vector1, table[key])
+    my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
+    print("dot p to 1 saved")
+
+
+def lin_dist(database, vector1=0, analysis_filename=0):
+    with open(database, "r") as f:
+        table = json.load(f)
+    lin_dist_to_1 = {}
+
+    if vector1 == 0:
+        vector1 = [0.025515518153991442 for _ in range(1536)]
+    elif vector1 == 1:
+        vector1 = table[str(list(table.keys())[0])]
+
+    for key in table.keys():
+        lin_dist_to_1[key] = np.linalg.norm(np.array(vector1) - np.array(table[key]))
+
+    my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
+    print("lin dist to 1 saved")
+
+
+def manhattan_dist(database, vector1=0, analysis_filename=0):
+    with open(database, "r") as f:
+        table = json.load(f)
+    manhattan_dist_to_1 = {}
+
+    if vector1 == 0:
+        vector1 = [0.025515518153991442 for _ in range(1536)]
+    elif vector1 == 1:
+        vector1 = table[str(list(table.keys())[0])]
+
+    for key in table.keys():
+        # Manhattan distance is the sum of absolute differences
+        # (the original summed signed differences, which cancel out)
+        manhattan_dist_to_1[key] = np.sum(np.abs(np.array(vector1) - np.array(table[key])))
+
+    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
+    print("manhattan dist to 1 saved")
+
+
+# vec_table: an in-memory {term: vector} dictionary
+def sim_search_fly(vec_table, term, debug=False):
+    if debug:
+        print(type(vec_table))
+        print(type(term))
+        print(type(vec_table[list(vec_table.keys())[0]]))
+        print("vec table:")
+        print(vec_table[list(vec_table.keys())[5]][:4])
+        print("search term")
+        print(term[:4])
+    if isinstance(term, str):
+        print("str")
+        vector1 = my_new_openai.vectorize_data(term)
+    elif isinstance(term, list):
+        print("list")
+        vector1 = term
+    else:
+        print("invalid search_term/search_vector format")
+        return
+
+    sim_search_dict = {}
+    for key in vec_table.keys():
+        vector2 = vec_table[key]
+        if debug:
+            print("")
+            print(f"{vector1}")
+            print(f"{vector2}")
+            print(f"doing dot product for {key} and {term}")
+        # sentinel score for degenerate/placeholder vectors with repeated entries
+        if vector2[0] == vector2[1] and vector2[3] == vector2[4] and vector2[5] == vector2[6]:
+            dp = 200
+        else:
+            dp = np.dot(vector1, vector2)
+        # distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
+        if debug:
+            print(f"the dp is {dp}")
+            # print(f"the distance is {distance}")
+            print("")
+        sim_search_dict[key] = dp  # * distance
+
+    # sort with the biggest similarity first (same reverse=True fix as above)
+    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
+
+    if debug:
+        for key, value in list(sorted_table.items())[:5]:
+            print(f"{key}: {value}")
+
+    # first_key, first_value = list(sorted_table.items())[0]
+    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
+    return sorted_table
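A toy example of sim_search_fly with 8-dimensional vectors (real entries are 1536-dim ada-002 embeddings; passing the query as a list skips the OpenAI call, and the two table keys are made up):

    from my_2_sim_search import sim_search_fly

    vec_table = {
        "Bauprojekt": [0.1, 0.2, 0.0, 0.3, 0.1, 0.2, 0.0, 0.1],
        "Vertrag":    [0.0, 0.1, 0.2, 0.1, 0.0, 0.3, 0.2, 0.1],
    }
    query = [0.1, 0.2, 0.1, 0.3, 0.1, 0.2, 0.0, 0.1]
    ranked = sim_search_fly(vec_table, query)
    print(list(ranked.items())[0])  # best match and its dot-product score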
my_new_openai.py
ADDED
@@ -0,0 +1,195 @@
+import os
+from openai import OpenAI
+import requests
+import base64
+from pydub import AudioSegment
+from moviepy.editor import VideoFileClip
+
+client = OpenAI()
+
+
+def image_bytes_to_base64(image_bytes):
+    """
+    Converts an image from bytes to a Base64 encoded string.
+
+    Args:
+        image_bytes (bytes): Byte content of the image.
+
+    Returns:
+        str: A Base64 encoded string of the image.
+    """
+    return base64.b64encode(image_bytes).decode('utf-8')
+
+
+def image_to_base64(image_path):
+    with open(image_path, "rb") as image_file:
+        return str(base64.b64encode(image_file.read()).decode('utf-8'))
+
+
+def gpt4_new(prompt_text):
+    gpt_response = client.chat.completions.create(
+        model="gpt-4",
+        messages=[{"role": "system",
+                   # system prompt (German): "You are a machine that classifies documents."
+                   "content": "Du bist eine Maschine, die Dokumente klassifiziert."},
+                  {"role": "user", "content": prompt_text}])
+    return gpt_response.choices[0].message.content
+
+
+def vectorize_data(data_input):
+    # input can be a list or a string:
+
+    if isinstance(data_input, list):
+        # returning a dictionary
+        my_dict = {}
+        for item in data_input:
+            # embed each item individually (the original passed the whole list
+            # on every iteration, so every key received the same vector)
+            my_dict[str(item)] = client.embeddings.create(
+                input=item, model="text-embedding-ada-002").data[0].embedding
+        return my_dict
+
+    elif isinstance(data_input, str):
+        # returning just the vector
+        return client.embeddings.create(input=data_input, model="text-embedding-ada-002").data[0].embedding
+
+    else:
+        print("none")
+
+
+def img_create(prompt="a nice house on the beach", download_path=""):
+    # to open, must download
+    my_url = client.images.generate(model="dall-e-3", prompt=prompt, size="1024x1024").data[0].url
+    if download_path:
+        my_image = requests.get(my_url)
+        if my_image.status_code == 200:
+            with open(download_path, 'wb') as f:
+                f.write(my_image.content)
+        else:
+            print("Failed to retrieve image")
+    return my_url
+
+
+def img_to_text(img_url="", img_base64="", prompt="What’s in this image?", print_out=True):
+    if img_url:
+        img_desc_response = client.chat.completions.create(
+            model="gpt-4-turbo",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": img_url,
+                            },
+                        },
+                    ],
+                }
+            ],
+            max_tokens=500,
+        )
+        if print_out:
+            print(img_desc_response.choices[0].message.content)
+        return img_desc_response.choices[0].message.content
+    elif img_base64:
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
+        }
+        payload = {
+            "model": "gpt-4-turbo",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{img_base64}"
+                            }
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": 300
+        }
+        img_desc_response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+        if print_out:
+            print(img_desc_response.json()["choices"][0]["message"]["content"])
+        return img_desc_response.json()["choices"][0]["message"]["content"]
+    else:
+        # the original returned the ValueError class instead of raising it
+        raise ValueError("either img_url or img_base64 must be provided")
+
+
+def encode_image_to_base64(image_path):
+    with open(image_path, "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
+    return encoded_string
+
+
+def mp4_to_mp3(video_file_path, audio_file_path):
+    # Load the video file
+    video = VideoFileClip(video_file_path)
+
+    # Extract audio from the video and write it to an MP3 file
+    video.audio.write_audiofile(audio_file_path)
+
+    # Close the video file to free resources
+    video.close()
+
+    print(f"Converted {video_file_path} to {audio_file_path}")
+
+
+def mp4_audio_to_mp3(mp4_audio_path, mp3_output_path):
+    # Load the MP4 file
+    audio = AudioSegment.from_file(mp4_audio_path, format="mp4")
+
+    # Export as an MP3 file
+    audio.export(mp3_output_path, format="mp3")
+
+    print(f"Converted {mp4_audio_path} to {mp3_output_path}")
+
+
+def table_to_text(table=None, prompt="describe this table in plain text. "
+                                     "be as precise as possible. spare no detail. "
+                                     "what is in this table?", print_out=True):
+    if table is not None:
+        response = gpt4_new(f"{prompt} TABLE: {table}")
+        if print_out:
+            print(response)
+        return response
+    else:
+        raise ValueError("no table was given")
+
+
+def danja():
+    # mp4_file = "C:\\Users\\eliaw\\Downloads\\WhatsApp Audio 2024-05-10 at 22.17.12.mp4"
+
+    # mp3_file = "output_audio.mp3"
+    mp3_file = "C:\\Users\\eliaw\\Downloads\\WhatsApp Audio 2024-05-10 at 22.17.12.mp3"
+
+    # mp4_audio_to_mp3(mp4_file, mp3_file)
+
+    # Usage example
+    # mp4_to_mp3(mp4_file, mp3_file)
+
+    audio_file = open(mp3_file, "rb")
+    transcription = client.audio.transcriptions.create(
+        model="whisper-1",
+        file=audio_file
+    )
+    print(transcription.text)
+
+
+if __name__ == "__main__":
+    # print("here are all functions that directly call openai.")
+    # img_create("a skier in the swiss alps", download_path="skier.png")
+    # img_to_text(img_base64=encode_image_to_base64("skier.png"))
+    # print(image_to_base64("skier.png"))
+    # print(vectorize_data("test string"))
+
+    # gpt4_new requires a prompt; the original called it with no arguments
+    print(gpt4_new("test prompt"))
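A quick smoke test for this wrapper module (hypothetical; requires OPENAI_API_KEY in the environment and incurs API cost):

    import my_new_openai

    vec = my_new_openai.vectorize_data("Baugesuch für ein Mehrfamilienhaus")
    print(len(vec))  # 1536 dimensions for text-embedding-ada-002

    answer = my_new_openai.gpt4_new("Welche Dokumentklasse passt zu einem Baugesuch?")
    print(answer)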
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+streamlit~=1.33.0
+bcrypt~=4.1.2
+psycopg2-binary~=2.9.9
+openai~=1.23.2
+pypdf2~=3.0.1
+langchain~=0.1.16
+tiktoken~=0.6.0
+numpy~=1.26.4
+requests~=2.31.0
+faiss-cpu
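For reference: ingest.py, my_1_reader.py, my_new_openai.py and setup_db.py additionally import pdfplumber, tabula, fitz (PyMuPDF), pandas, PIL, pydub, moviepy and openpyxl, none of which appear in this list, so they would need to be installed separately.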
setup_db.py
ADDED
@@ -0,0 +1,50 @@
+import time
+import openpyxl
+import my_new_openai
+
+
+def update_excel_with_sums(filename):
+    # (despite the name, this writes embedding vectors, not sums)
+    # Load the workbook and select the active worksheet
+    workbook = openpyxl.load_workbook(filename)
+    sheet = workbook.active
+
+    # Iterate through each row in the sheet
+    for row in sheet.iter_rows(min_row=1, min_col=2, max_col=3):
+        Bn, Cn = row  # B and C are columns 2 and 3 respectively
+        vector = my_new_openai.vectorize_data(f"{Bn.value}: {Cn.value}") if Bn.value and Cn.value else 0
+        if vector != 0:
+            # enumerate instead of vector.index(val): index() returns the first
+            # occurrence, so duplicate values would all land in the same column
+            for idx, val in enumerate(vector):
+                sheet.cell(row=Bn.row, column=4 + idx).value = val
+
+    # Save the workbook
+    workbook.save(filename)
+    print(f"Updated the file '{filename}' with vectors in column D.")
+
+
+def load_vectorstore_from_excel(filename):
+    # Returns a dictionary {row label: embedding vector}.
+    # Load the workbook and select the active worksheet
+    workbook = openpyxl.load_workbook(filename)
+    sheet = workbook.active
+
+    # Iterate through rows 3..633 of the KBOB sheet (hard-coded range)
+    vec_store = {}
+    for row in range(3, 634):
+        vec = []
+        for col in range(0, 1536):
+            val = sheet.cell(row=row, column=4 + col).value
+            vec.append(val)
+        vec_store[str(sheet.cell(row=row, column=1).value)] = vec
+    return vec_store
+
+
+if __name__ == '__main__':
+    # update_excel_with_sums("KBOB_Klassifizierung.xlsx")
+    t = time.time()
+
+    vec_store = load_vectorstore_from_excel("KBOB_Klassifizierung.xlsx")
+
+    print(time.time() - t)
+    for e in vec_store.keys():
+        print(f"{e}: {vec_store[e][0]}, {vec_store[e][1]}, .... {vec_store[e][-1]}")
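An end-to-end sketch combining the two modules from this upload (hypothetical query string; assumes the sheet was populated by update_excel_with_sums and OPENAI_API_KEY is set):

    from setup_db import load_vectorstore_from_excel
    from my_2_sim_search import sim_search_fly

    vec_store = load_vectorstore_from_excel("KBOB_Klassifizierung.xlsx")
    ranked = sim_search_fly(vec_store, "Statisches Gutachten Tragwerk")
    print(list(ranked.keys())[:3])  # three most similar KBOB classes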