Commit 0c4e36a ("structure")
Parent(s): 6bccf6f
Files changed:
- .gitignore  +0 -2
- LICENSE  +201 -201
- app.py  +2 -6
- KBOB_Klassifizierung.xlsx → data/KBOB_Klassifizierung.xlsx  (renamed, file unchanged)
- requirements.txt  +15 -15
- ask_app.py → util/ask_app.py  +243 -243
- classify_app.py → util/classify_app.py  +197 -197
- ingest.py → util/ingest.py  +126 -126
- my_1_reader.py → util/my_1_reader.py  +201 -201
- my_1_writer.py → util/my_1_writer.py  (renamed, file unchanged)
- my_2_sim_search.py → util/my_2_sim_search.py  +163 -163
- my_new_openai.py → util/my_new_openai.py  +151 -151
- my_vectors.py → util/my_vectors.py  (renamed, file unchanged)
- setup_db.py → util/setup_db.py  +50 -50
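
Taken together, the renames give the working tree this shape (a sketch assembled from the file list above; files the commit does not touch are omitted):

    .
    ├── .gitignore
    ├── LICENSE
    ├── app.py
    ├── requirements.txt
    ├── data/
    │   └── KBOB_Klassifizierung.xlsx
    └── util/
        ├── ask_app.py
        ├── classify_app.py
        ├── ingest.py
        ├── my_1_reader.py
        ├── my_1_writer.py
        ├── my_2_sim_search.py
        ├── my_new_openai.py
        ├── my_vectors.py
        └── setup_db.py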
.gitignore
CHANGED
@@ -120,9 +120,7 @@ celerybeat.pid
 
 # Environments
 .env
-.venv
 env/
-venv/
 ENV/
 env.bak/
 venv.bak/
LICENSE
CHANGED
@@ -1,201 +1,201 @@
All 201 lines of the Apache License, Version 2.0 text were removed and re-added verbatim; the text is identical on both sides of the diff, consistent with a line-ending or whitespace rewrite rather than any change to the license terms.
app.py
CHANGED
@@ -1,14 +1,10 @@
 """
 testing my own vectors
 """
-import ingest
-import my_2_sim_search
-import my_new_openai
-import setup_db
+from util import setup_db, my_vectors, my_2_sim_search, ingest, my_new_openai
 import time
 import streamlit as st
 import os
-import my_vectors
 
 
 def merge_indices(index1, index2):
@@ -148,7 +144,7 @@ def main():
         st.warning("unsaved embeddings will be lost.")
     else:
         file = st.file_uploader("upload file", accept_multiple_files=False)
-        vec_store = setup_db.load_vectorstore_from_excel("KBOB_Klassifizierung.xlsx")
+        vec_store = setup_db.load_vectorstore_from_excel("data/KBOB_Klassifizierung.xlsx")
     if st.button("classify me!"):
         with st.spinner("Classifying..."):
             query_vecs = []
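The rewritten import line assumes util/ is importable as a package. No util/__init__.py shows up in the file list, which still works on Python 3.3+: util/ is resolved as a namespace package whenever the repository root is on sys.path, as it is when the Space launches streamlit run app.py from the root. A hypothetical smoke test, not part of the commit:

    # Run from the repository root (illustrative only).
    from util import setup_db, my_vectors, my_2_sim_search, ingest, my_new_openai

    print(setup_db.__name__)  # -> "util.setup_db"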
KBOB_Klassifizierung.xlsx → data/KBOB_Klassifizierung.xlsx
RENAMED
File without changes
requirements.txt
CHANGED
@@ -1,16 +1,16 @@
Lines 1 through 15 were removed and re-added with identical content (a line-ending rewrite); line 16 is untouched. The pinned list, shown once:

streamlit~=1.33.0
bcrypt~=4.1.2
psycopg2-binary~=2.9.9
openai~=1.23.2
pypdf2~=3.0.1
langchain~=0.1.16
tiktoken~=0.6.0
numpy~=1.26.4
requests~=2.31.0
pandas~=2.2.2
tabula~=1.0.5
pdfplumber~=0.11.0
PyMuPDF~=1.24.3
fitz~=0.0.1.dev2
pillow~=10.3.0
openpyxl~=3.1.2
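An aside on the pins themselves (this commit only rewrites line endings here): PyMuPDF~=1.24.3 already provides the fitz module, while the separate PyPI project named fitz (pinned at 0.0.1.dev2) is unrelated and can shadow PyMuPDF's module when both are installed. If import fitz ever misbehaves in this environment, dropping the fitz pin is the usual remedy.

    # Illustrative check of which package owns the fitz module:
    import fitz
    print(fitz.__doc__)  # PyMuPDF's module reports a "PyMuPDF <version>" banner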
ask_app.py → util/ask_app.py
RENAMED
@@ -1,243 +1,243 @@
All 243 lines were removed and re-added verbatim alongside the rename; the content is identical on both sides (a line-ending rewrite), so the file is shown once:

"""
complete, functional RAG App
stores vectors in session state, or locally.
add function to display retrieved documents
"""

# import time
from datetime import datetime
# import openai
# import tiktoken
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from html_templates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub
import os
import numpy as np
import faiss_utils
from langchain_community.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings


def merge_faiss_indices(index1, index2):
    """
    Merge two FAISS indices into a new index, assuming both are of the same type and dimensionality.

    Args:
    index1 (faiss.Index): The first FAISS index.
    index2 (faiss.Index): The second FAISS index.

    Returns:
    faiss.Index: A new FAISS index containing all vectors from index1 and index2.
    """

    # Check if both indices are the same type
    if type(index1) != type(index2):
        raise ValueError("Indices are of different types")

    # Check dimensionality
    if index1.d != index2.d:
        raise ValueError("Indices have different dimensionality")

    # Determine type of indices
    if isinstance(index1, FAISS.IndexFlatL2):
        # Handle simple flat indices
        d = index1.d
        # Extract vectors from both indices
        xb1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
        xb2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)

        # Combine vectors
        xb_combined = np.vstack((xb1, xb2))

        # Create a new index and add combined vectors
        new_index = FAISS.IndexFlatL2(d)
        new_index.add(xb_combined)
        return new_index

    elif isinstance(index1, FAISS.IndexIVFFlat):
        # Handle quantized indices (IndexIVFFlat)
        d = index1.d
        nlist = index1.nlist
        quantizer = FAISS.IndexFlatL2(d)  # Re-create the appropriate quantizer

        # Create a new index with the same configuration
        new_index = FAISS.IndexIVFFlat(quantizer, d, nlist, FAISS.METRIC_L2)

        # If the indices are already trained, you can directly add the vectors
        # Otherwise, you may need to train new_index using a representative subset of vectors
        vecs1 = FAISS.rev_swig_ptr(index1.xb.data(), index1.ntotal * d)
        vecs2 = FAISS.rev_swig_ptr(index2.xb.data(), index2.ntotal * d)
        new_index.add(vecs1)
        new_index.add(vecs2)
        return new_index

    else:
        raise TypeError("Index type not supported for merging in this function")


def get_pdf_text(pdf_docs):
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks


def get_faiss_vectorstore(text_chunks):
    if sst.openai:
        my_embeddings = OpenAIEmbeddings()
    else:
        my_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=my_embeddings)
    return vectorstore


def get_conversation_chain(vectorstore):
    if sst.openai:
        llm = ChatOpenAI()
    else:
        llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})

    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory
    )
    return conversation_chain


def handle_userinput(user_question):
    response = sst.conversation({'question': user_question})
    sst.chat_history = response['chat_history']

    for i, message in enumerate(sst.chat_history):
        # Display user message
        if i % 2 == 0:
            st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            print(message)
            # Display AI response
            st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
            # Display source document information if available in the message
            if hasattr(message, 'source') and message.source:
                st.write(f"Source Document: {message.source}", unsafe_allow_html=True)


if True:
    BASE_URL = "https://api.vectara.io/v1"
    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
    OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
    PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
    HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
    VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
    VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]
    headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}


def main():
    st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
    st.write(css, unsafe_allow_html=True)
    if "conversation" not in sst:
        sst.conversation = None
    if "chat_history" not in sst:
        sst.chat_history = None
    if "page" not in sst:
        sst.page = "home"
    if "openai" not in sst:
        sst.openai = True
    if "login" not in sst:
        sst.login = False
    if 'submitted_user_query' not in sst:
        sst.submitted_user_query = ''
    if 'submitted_user_safe' not in sst:
        sst.submitted_user_safe = ''
    if 'submitted_user_load' not in sst:
        sst.submitted_user_load = ''

    def submit_user_query():
        sst.submitted_user_query = sst.widget_user_query
        sst.widget_user_query = ''

    def submit_user_safe():
        sst.submitted_user_safe = sst.widget_user_safe
        sst.widget_user_safe = ''
        if "vectorstore" in sst:
            # faiss_name = str(datetime.now().strftime("%Y%m%d%H%M%S")) + "faiss_index"
            faiss_utils.save_local(sst.vectorstore, path=sst.submitted_user_safe)
            st.sidebar.success("saved")
        else:
            st.sidebar.warning("No embeddings to save. Please process documents first.")

    def submit_user_load():
        sst.submitted_user_load = sst.widget_user_load
        sst.widget_user_load = ''
        if os.path.exists(sst.submitted_user_load):
            new_db = faiss_utils.load_vectorstore(f"{sst.submitted_user_load}/faiss_index.index")
            if "vectorstore" in sst:
                if new_db is not None:  # Check if this is working
                    sst.vectorstore.merge_from(new_db)
                    sst.conversation = get_conversation_chain(sst.vectorstore)
                    st.sidebar.success("faiss loaded")
            else:
                if new_db is not None:  # Check if this is working
                    sst.vectorstore = new_db
                    sst.conversation = get_conversation_chain(new_db)
                    st.sidebar.success("faiss loaded")
        else:
            st.sidebar.warning("Couldn't load/find embeddings")

    st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:

        # user_question = st.text_input("Ask a question about your documents:", key="user_query", on_change=handle_query)
        st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)
        # sst.openai = st.toggle(label="use openai?")

        if sst.submitted_user_query:
            if "vectorstore" in sst:
                handle_userinput(sst.submitted_user_query)
            else:
                st.warning("no vectorstore loaded.")

        with st.sidebar:
            st.subheader("Your documents")
            pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
            if st.button("Process"):
                with st.spinner("Processing"):
                    vec = get_faiss_vectorstore(get_text_chunks(get_pdf_text(pdf_docs)))
                    sst.vectorstore = vec
                    sst.conversation = get_conversation_chain(vec)
                    st.success("embedding complete")

            st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
                          on_change=submit_user_safe)

            st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
                          on_change=submit_user_load)


if __name__ == '__main__':
    sst = st.session_state
    ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
    main()
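One detail worth flagging in merge_faiss_indices above: FAISS there is LangChain's vectorstore wrapper, which exposes none of IndexFlatL2, IndexIVFFlat, rev_swig_ptr, or METRIC_L2; those names live on the raw faiss module. A sketch of the flat-index branch written against faiss itself (an illustration under that assumption, not the committed code; reconstruct_n sidesteps the SWIG pointer arithmetic):

    import faiss
    import numpy as np

    def merge_flat_l2(index1: faiss.IndexFlatL2, index2: faiss.IndexFlatL2) -> faiss.IndexFlatL2:
        # Both indices must hold vectors of the same dimensionality.
        if index1.d != index2.d:
            raise ValueError("Indices have different dimensionality")
        # reconstruct_n returns the stored vectors as an (ntotal, d) float32 array.
        xb1 = index1.reconstruct_n(0, index1.ntotal)
        xb2 = index2.reconstruct_n(0, index2.ntotal)
        merged = faiss.IndexFlatL2(index1.d)
        merged.add(np.vstack((xb1, xb2)))
        return merged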
classify_app.py → util/classify_app.py
RENAMED
@@ -1,197 +1,197 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
import os
|
3 |
-
# import openai
|
4 |
-
from PyPDF2 import PdfReader
|
5 |
-
from openai import OpenAI
|
6 |
-
from langchain.chat_models import ChatOpenAI
|
7 |
-
|
8 |
-
ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
|
9 |
-
|
10 |
-
|
11 |
-
def gpt4_new(prompt_text):
|
12 |
-
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
|
13 |
-
response = client.chat.completions.create(
|
14 |
-
model="gpt-4",
|
15 |
-
messages=[{"role": "system",
|
16 |
-
"content": "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten,"
|
17 |
-
"das Dokument in vorgegebene Kategorien klassifiziert."
|
18 |
-
"Du gibts möglichst kurze Antworten, am besten ein Wort"
|
19 |
-
"Du gibst keine Erklärungen oder Begründungen. "
|
20 |
-
"Du klassifizierst nur nach den vorgegebenen Kategorien."
|
21 |
-
"Wenn ein Dokument partout nicht klassifizierbar ist, "
|
22 |
-
"antwortest du mit '<no classification>'"},
|
23 |
-
{"role": "user", "content": prompt_text}])
|
24 |
-
return response.choices[0].message.content
|
25 |
-
|
26 |
-
|
27 |
-
# Define a function to ask a question to GPT-4
|
28 |
-
def ask_gpt4(question):
|
29 |
-
print(question) # we don't have to submit the question?
|
30 |
-
try:
|
31 |
-
# Use the chat function to send a message and get a response
|
32 |
-
response = ChatOpenAI()
|
33 |
-
# Extract the response text
|
34 |
-
return response["choices"][0]["message"]["content"]
|
35 |
-
except Exception as e:
|
36 |
-
# Handle exceptions that may occur during the API call
|
37 |
-
return str(e)
|
38 |
-
|
39 |
-
|
40 |
-
def process_prompts_and_save(my_prompts):
|
41 |
-
# Ensure the responses list is empty initially
|
42 |
-
responses = []
|
43 |
-
|
44 |
-
# Loop through each prompt in the list
|
45 |
-
for prompt in my_prompts:
|
46 |
-
try:
|
47 |
-
# ADD LOGIC TO READ FILE AND CLASSIFY
|
48 |
-
# Generate response for each prompt and append to the list
|
49 |
-
response = ask_gpt4(prompt)
|
50 |
-
sol = f"{prompt}\n\n{response}\n\n\n\n"
|
51 |
-
print(sol)
|
52 |
-
responses.append(sol)
|
53 |
-
except Exception as e:
|
54 |
-
# In case of an error, log the error with the prompt
|
55 |
-
responses.append(f"{prompt}\n\nError:{str(e)}\n\n\n\n")
|
56 |
-
|
57 |
-
# Writing all responses to a text file
|
58 |
-
with open('gpt4_responses.txt', 'w', encoding='utf-8') as file:
|
59 |
-
file.writelines(responses)
|
60 |
-
|
61 |
-
|
62 |
-
def get_pdfs_text(pdf_docs):
|
63 |
-
text = ""
|
64 |
-
for pdf in pdf_docs:
|
65 |
-
pdf_reader = PdfReader(pdf)
|
66 |
-
for page in pdf_reader.pages:
|
67 |
-
text += page.extract_text()
|
68 |
-
return text
|
69 |
-
|
70 |
-
|
71 |
-
def get_pdf_text(pdf_document):
|
72 |
-
text = ""
|
73 |
-
pdf_reader = PdfReader(pdf_document)
|
74 |
-
for page in pdf_reader.pages:
|
75 |
-
text += page.extract_text()
|
76 |
-
return text
|
77 |
-
|
78 |
-
|
79 |
-
def json_open(filename):
|
80 |
-
with open(filename, "r") as f:
|
81 |
-
mydata = f.read()
|
82 |
-
return mydata
|
83 |
-
|
84 |
-
|
85 |
-
def main():
|
86 |
-
st.title("Doc Classifier")
|
87 |
-
l, r = st.columns(2)
|
88 |
-
if st.toggle("show README"):
|
89 |
-
st.subheader("Funktion: ")
|
90 |
-
st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente")
|
91 |
-
st.write("des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren.")
|
92 |
-
st.write("lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren.")
|
93 |
-
st.write("Feedback und Bugs gerne an elia.waefler@insel.ch")
|
94 |
-
st.write("Vielen Dank.")
|
95 |
-
st.write("")
|
96 |
-
with l:
|
97 |
-
st.subheader("Limitationen: ")
|
98 |
-
st.write("bisher nur PDFs")
|
99 |
-
st.write("nur Disziplin, Doc typ. und Geschoss")
|
100 |
-
st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
|
101 |
-
st.write("")
|
102 |
-
with r:
|
103 |
-
st.subheader("geplante Erweiterungen:")
|
104 |
-
st.write("Text Beschreibung wird von AI hinzugefügt")
|
105 |
-
st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
|
106 |
-
st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")
|
107 |
-
|
108 |
-
if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
|
109 |
-
uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)
|
110 |
-
#print(uploaded_file)
|
111 |
-
#print(uploaded_file.name)
|
112 |
-
|
113 |
-
if st.button("classify KBOB!"):
|
114 |
-
if uploaded_files is not None:
|
115 |
-
with st.container():
|
116 |
-
# col1, col2, col3, col4, col5 = st.columns(5)
|
117 |
-
col1, col2, col3 = st.columns(3)
|
118 |
-
all_metadata = []
|
119 |
-
with col1:
|
120 |
-
st.write("Disziplin")
|
121 |
-
st.write(f"")
|
122 |
-
with col2:
|
123 |
-
st.write("Dokumententyp")
|
124 |
-
st.write(f"")
|
125 |
-
with col3:
|
126 |
-
st.write("Geschoss")
|
127 |
-
st.write(f"")
|
128 |
-
|
129 |
-
for file in uploaded_files:
|
130 |
-
metadata = [file.name]
|
131 |
-
with col1:
|
132 |
-
with st.spinner("GPT4 at work"):
|
133 |
-
pdf_text = str(get_pdf_text(file))
|
134 |
-
prompt_1 = auftrag_0 + auftrag_1_disziplin + str(Baubranchen_Disziplinen) + pdf_text
|
135 |
-
answer_1 = gpt4_new(prompt_1)
|
136 |
-
print(prompt_1)
|
137 |
-
metadata.append(answer_1)
|
138 |
-
st.write(answer_1)
|
139 |
-
|
140 |
-
with col2:
|
141 |
-
with st.spinner("GPT4 at work"):
|
142 |
-
prompt_2 = auftrag_0 + auftrag_1_type + str(Dokumententypen) + pdf_text
|
143 |
-
answer_2 = gpt4_new(prompt_2)
|
144 |
-
print(prompt_2)
|
145 |
-
metadata.append(answer_2)
|
146 |
-
|
147 |
-
st.write(answer_2)
|
148 |
-
|
149 |
-
with col3:
|
150 |
-
with st.spinner("GPT4 at work"):
|
151 |
-
prompt_3 = auftrag_0 + auftrag_1_ge + str(ASH_Geschosse) + pdf_text
|
152 |
-
answer_3 = gpt4_new(prompt_3)
|
153 |
-
print(prompt_3)
|
154 |
-
metadata.append(answer_2)
|
155 |
-
|
156 |
-
st.write(answer_3)
|
157 |
-
|
158 |
-
all_metadata.append(metadata)
|
159 |
-
|
160 |
-
metadata_filename = "ai_generated_metadata.txt"
|
161 |
-
with open(metadata_filename, 'w', encoding='utf-8') as f:
|
162 |
-
for line in all_metadata:
|
163 |
-
f.writelines("\n")
|
164 |
-
for item in line:
|
165 |
-
f.writelines(item)
|
166 |
-
f.writelines(";")
|
167 |
-
|
168 |
-
f.writelines("\n")
|
169 |
-
|
170 |
-
st.success("classified, saved")
|
171 |
-
st.download_button(f"Download Metadata", json_open(metadata_filename), file_name=metadata_filename)
|
172 |
-
else:
|
173 |
-
st.warning("no file")
|
174 |
-
|
175 |
-
|
176 |
-
if __name__ == "__main__":
|
177 |
-
#prompts = ["classify the document, tell me the ", "hello"]
|
178 |
-
#process_prompts_and_save(prompts)
|
179 |
-
auftrag_0 = "Klassifiziere dieses Dokument nach "
|
180 |
-
auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
|
181 |
-
auftrag_1_type = "diesen 'Dokumententypen': "
|
182 |
-
auftrag_1_ge = "diesen 'Geschossen': "
|
183 |
-
Baubranchen_Disziplinen = ['A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
|
184 |
-
'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
|
185 |
-
'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
|
186 |
-
'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
|
187 |
-
'Z-Lichtplanung']
|
188 |
-
auftrag_2 = "gib nur den am besten passendsten Eintrag zurück. " \
|
189 |
-
"Keine weiteren Ausführungen oder Erklärungen. " \
|
190 |
-
"Antworte am besten in einem Wort. " \
|
191 |
-
"Hier der Dokumenteninhalt: "
|
192 |
-
Dokumententypen = ['Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
|
193 |
-
'Prozessdokumentation', 'Fachdokumentation', 'Anlagedokumentation']
|
194 |
-
ASH_Geschosse = ['U4', 'U3', 'U2', 'U1',
|
195 |
-
'A', 'B', 'C', 'D', 'E', 'F', 'G']
|
196 |
-
#print(str(Baubranchen_Disziplinen))
|
197 |
-
main()
|
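For orientation, a minimal sketch of how the prompt pieces above compose for a single document, outside the Streamlit flow; the PDF file name is a placeholder and OPENAI_API_KEY must be set in the environment:

# usage sketch (hypothetical file name; requires OPENAI_API_KEY)
pdf_text = str(get_pdf_text("beispiel_plan.pdf"))
prompt_1 = auftrag_0 + auftrag_1_disziplin + str(Baubranchen_Disziplinen) + pdf_text
print(gpt4_new(prompt_1))  # expected: a single label such as 'A-Architektur'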
ingest.py → util/ingest.py
RENAMED
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
import tabula
import io
import fitz  # PyMuPDF
import pdfplumber
import pandas as pd
from PIL import Image


def get_pdf_tables(pdf_bytes):
    """
    Extracts tables from a PDF file loaded directly from bytes.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[pandas.DataFrame]: A list of DataFrames, each representing a table extracted from the PDF.
    """
    tables = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page in pdf.pages:
            # Extract tables from the current page
            page_tables = page.extract_tables()
            for table in page_tables:
                # Collect the raw table (a list of rows) for conversion below
                tables.append(table)

    # Convert the lists of lists to pandas DataFrames, using the first row as header
    dataframes = [pd.DataFrame(table[1:], columns=table[0]) for table in tables if table]
    return dataframes


def get_pdf_images(pdf_bytes):
    """
    Extracts images and captures screenshots of each page from a given PDF's bytes.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[bytes]: A list of image bytes extracted from the PDF, including screenshots of each page.
    """
    images = []
    pdf_stream = io.BytesIO(pdf_bytes)
    doc = fitz.open("pdf", pdf_stream.read())

    for page_num, page in enumerate(doc):
        # Take a screenshot of the current page
        pix = page.get_pixmap()  # captures the page as a raster image
        img_bytes = pix.tobytes("png")  # save the pixmap as PNG bytes
        images.append(img_bytes)  # append the screenshot to the list of images

        # Extract embedded images
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            images.append(image_bytes)

    doc.close()
    return images


def get_pdf_old_tables(pdf_bytes):
    """
    Extracts tables from a given PDF's bytes using Tabula.

    Args:
        pdf_bytes (bytes): The byte content of the PDF file.

    Returns:
        List[pandas.DataFrame]: A list of DataFrames, each representing a table extracted from the PDF.
    """
    pdf_stream = io.BytesIO(pdf_bytes)
    # Read PDF into a list of DataFrames
    tables = tabula.read_pdf(pdf_stream, pages='all', multiple_tables=True)
    return tables


def get_pdf_text(pdf_docs):
    text = ""
    if isinstance(pdf_docs, list):
        for pdf in pdf_docs:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                text += page.extract_text()
    else:
        pdf_reader = PdfReader(pdf_docs)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks


def extract_images_from_pdf_path(pdf_path):
    doc = fitz.open(pdf_path)
    images = []
    for i in range(len(doc)):
        for img in doc.get_page_images(i):
            xref = img[0]
            img_data = doc.extract_image(xref)
            img_bytes = img_data['image']

            image = Image.open(io.BytesIO(img_bytes))
            images.append(image)

    return images


def get_tables_from_pdf_path(pdf_path):
    # read_pdf loads the PDF tables into pandas DataFrames
    tables = tabula.read_pdf(pdf_path, pages='all')
    return tables
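A short sketch of the intended call pattern for these helpers; the path is a placeholder, and tabula additionally requires a Java runtime on the host:

# usage sketch (hypothetical path; tabula needs Java installed)
with open("plan.pdf", "rb") as f:
    pdf_bytes = f.read()
dataframes = get_pdf_tables(pdf_bytes)   # pdfplumber-based table extraction
images = get_pdf_images(pdf_bytes)       # page screenshots plus embedded images
chunks = get_text_chunks(get_pdf_text(["plan.pdf"]))  # 1000-char chunks, 200 overlap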
my_1_reader.py → util/my_1_reader.py
RENAMED
# NEEDS CLEAN-UP

import json
import os
import subprocess
import PyPDF2
import csv
import fitz  # PyMuPDF


def extract_text_from_pdf(pdf_path):
    """
    Extracts all text from a PDF file.

    :param pdf_path: Path to the PDF file.
    :return: Extracted text as a string.
    """
    # Open the PDF file
    doc = fitz.open(pdf_path)

    # Initialize an empty string to hold the text
    text = ''

    # Iterate through each page in the PDF
    for page_num in range(len(doc)):
        # Get a page
        page = doc.load_page(page_num)

        # Extract text from the page and add it to the result
        text += page.get_text()

    # Close the document
    doc.close()

    return text


def read_pdfs_from_folder(folder_path):
    """
    Reads all PDF files in the specified folder using PdfReader and extracts their text.

    Parameters:
    - folder_path: The path to the folder containing PDF files.

    Returns:
    - A dictionary with file names as keys and their extracted text as values.
    """
    pdf_texts = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                text = ''
                for page in pdf_reader.pages:
                    try:
                        text += page.extract_text()
                    except UnicodeDecodeError as e:
                        print(e)
                # Normalize umlauts and replace characters that would break the CSV output
                for old, new in [("ä", "ae"), ("Ä", "Ae"), ("ö", "oe"), ("Ö", "Oe"),
                                 ("ü", "ue"), ("Ü", "Ue"), (",", "_"), (";", "_"),
                                 ("\\", "_"), ('"', "_"), ("\n", "<newline>")]:
                    text = text.replace(old, new)
                pdf_texts[filename] = text
    return pdf_texts


def read_csv_lines_as_strings(filename):
    """
    Opens a CSV file and returns each line as a string in a list.

    Parameters:
    - filename: The path to the CSV file.

    Returns:
    - A list of strings, each representing a line from the CSV file.
    """
    lines_as_strings = []
    with open(filename, newline='') as csvfile:
        try:
            reader = csv.reader(csvfile)
            for row in reader:
                # Convert the row (a list of values) back into a comma-separated string
                line_as_string = ','.join(row)
                lines_as_strings.append(line_as_string)
        except UnicodeDecodeError as e:
            print(e)
    return lines_as_strings


# Function to load data from JSON files
def load_data(filename):
    with open(filename, 'r') as file:
        try:
            return json.load(file)
        except UnicodeDecodeError as err:
            print(err)
            return {}


def find_and_open_file(filename, start_directory):
    """
    Attempts to open a file with the given filename starting from the specified directory.
    If the file is not found, searches recursively in all subfolders. Works across macOS, Linux, and Windows.
    """
    for root, dirs, files in os.walk(start_directory):
        if filename in files:
            filepath = os.path.join(root, filename)
            print(f"File found: {filepath}")
            return filepath
    print(f"File {filename} not found.")
    return None


def open_file(filepath):
    """
    Opens the file with the default application, based on the operating system.
    """
    if os.path.exists(filepath):
        if os.name == 'posix':  # Linux, macOS, etc.
            subprocess.call(('open', filepath))
        elif os.name == 'nt':  # Windows
            os.startfile(filepath)
        else:
            print(f"Cannot open file on this operating system: {filepath}")
    else:
        print(f"File does not exist: {filepath}")


def list_folders_files_recursive(path, depth=0):
    """
    Recursively lists all folders and files within the specified path, including subfolders.

    Parameters:
    - path: The directory path to list contents from.
    - depth: The current depth of recursion (used for indentation in print statements).

    Returns:
    - None
    """
    # Ensure the provided path is a directory
    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return

    indent = ' ' * depth  # Indentation based on recursion depth
    folders, files = [], []

    # List all entries in the directory
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            folders.append(entry)
            print(f"{indent}Folder: {entry}")
            # Recursively list subfolders and files
            list_folders_files_recursive(full_path, depth + 1)
        elif os.path.isfile(full_path):
            files.append(entry)

    for f in files:
        print(f"{indent}File: {f}")


def list_folders_files(path):
    """
    Lists all folders and files within the specified path.

    Parameters:
    - path: The directory path to list contents from.

    Returns:
    - A tuple of two lists: (folders, files).
    """
    folders = []
    files = []

    # Ensure the provided path is a directory
    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return folders, files

    # List all entries in the directory
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            folders.append(entry)
        elif os.path.isfile(full_path):
            files.append(entry)

    return folders, files


if __name__ == "__main__":
    print("here are all functions that read files")
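A brief sketch of how these readers are meant to be called; the paths are placeholders:

# usage sketch (hypothetical paths)
texts = read_pdfs_from_folder("docs/")        # {filename: normalized text}
path = find_and_open_file("report.pdf", ".")  # recursive lookup from the cwd
if path:
    open_file(path)                           # opens with the OS default app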
my_1_writer.py → util/my_1_writer.py
RENAMED
File without changes
my_2_sim_search.py → util/my_2_sim_search.py
RENAMED
import my_new_openai
import my_1_writer
import json
import numpy as np


# sim search with dot_product and lin_distance
# the newly vectorized TERM will be added to the database
# database = .json file
def sim_search_load_db(database, term, add_to_db=True, debug=False):
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    with open(database, "r") as f:
        table = json.load(f)
    sim_search_dict = {}
    for key in table.keys():
        vector2 = table[key]
        if debug:
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        dp = np.dot(vector1, vector2)
        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            print(f"the distance is {distance}")
        sim_search_dict[key] = dp * distance

    # sort with the biggest similarity first
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))

    if debug:
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    if add_to_db:
        if term in table.keys():
            print("the search term is in the database!")
        # add the newly vectorized term to the words, if not already in the vector table
        else:
            if database != "session/my_words_vec_table.json":
                database = "session/my_vecs.json"
            table[str(term)] = vector1
            my_1_writer.safe_my_dict_as_json(database, table)
    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table


def dot_p_to_1(database, vector1=0, analysis_filename=0):
    with open(database, "r") as f:
        table = json.load(f)
    dot_product_to1 = {}

    if vector1 == 0:
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    for key in table.keys():
        dot_product_to1[key] = np.dot(vector1, table[key])
    my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
    print("dot p to 1 saved")


def lin_dist(database, vector1=0, analysis_filename=0):
    with open(database, "r") as f:
        table = json.load(f)
    lin_dist_to_1 = {}

    if vector1 == 0:
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    for key in table.keys():
        lin_dist_to_1[key] = np.linalg.norm(np.array(vector1) - np.array(table[key]))

    my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
    print("lin dist to 1 saved")


def manhattan_dist(database, vector1=0, analysis_filename=0):
    with open(database, "r") as f:
        table = json.load(f)
    manhattan_dist_to_1 = {}

    if vector1 == 0:
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    for key in table.keys():
        # Manhattan distance = sum of absolute coordinate differences
        manhattan_dist_to_1[key] = np.sum(np.abs(np.array(vector1) - np.array(table[key])))

    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
    print("manhattan dist to 1 saved")


# vec_table
def sim_search_fly(vec_table, term, debug=False):
    if debug:
        print(type(vec_table))
        print(type(term))
        print(type(vec_table[list(vec_table.keys())[0]]))
        print("vec table:")
        print(vec_table[list(vec_table.keys())[5]][:4])
        print("search term")
        print(term[:4])
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return

    sim_search_dict = {}
    for key in vec_table.keys():
        vector2 = vec_table[key]
        if debug:
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        if vector2[0] == vector2[1] and vector2[3] == vector2[4] and vector2[5] == vector2[6]:
            dp = 200  # sentinel for degenerate rows with repeated entries
        else:
            dp = np.dot(vector1, vector2)
        # distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            # print(f"the distance is {distance}")
        sim_search_dict[key] = dp  # * distance

    # sort with the biggest similarity first
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))

    if debug:
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")

    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table
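The ranking here uses the raw dot product (sim_search_load_db additionally multiplies by the Euclidean distance). OpenAI's text-embedding-ada-002 vectors are normalized to unit length, so the dot product already equals cosine similarity; for vectors of arbitrary norm, an explicit cosine helper would look like this sketch (not part of the module):

# cosine similarity sketch (alternative ranking; assumes non-zero vectors)
def cosine_sim(v1, v2):
    v1, v2 = np.array(v1), np.array(v2)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))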
my_new_openai.py → util/my_new_openai.py
RENAMED
import os
from openai import OpenAI
import requests
import base64

client = OpenAI()


def image_bytes_to_base64(image_bytes):
    """
    Converts an image from bytes to a Base64 encoded string.

    Args:
        image_bytes (bytes): Byte content of the image.

    Returns:
        str: A Base64 encoded string of the image.
    """
    return base64.b64encode(image_bytes).decode('utf-8')


def image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        return str(base64.b64encode(image_file.read()).decode('utf-8'))


def gpt4_new(prompt_text):
    gpt_response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "system",
                   "content": "Du bist eine Maschine, die Dokumente klassifiziert."},
                  {"role": "user", "content": prompt_text}])
    return gpt_response.choices[0].message.content


def vectorize_data(data_input):
    # input can be a list or a string
    if isinstance(data_input, list):
        # returns a dictionary {item: embedding}
        my_dict = {}
        for item in data_input:
            # embed each item on its own, not the whole list at once
            my_dict[str(item)] = client.embeddings.create(
                input=item, model="text-embedding-ada-002").data[0].embedding
        return my_dict

    elif isinstance(data_input, str):
        # returns just the vector
        return client.embeddings.create(input=data_input, model="text-embedding-ada-002").data[0].embedding

    else:
        print("none")


def img_create(prompt="a nice house on the beach", download_path=""):
    # to open the image locally, it must be downloaded
    my_url = client.images.generate(model="dall-e-3", prompt=prompt, size="1024x1024").data[0].url
    if download_path:
        my_image = requests.get(my_url)
        if my_image.status_code == 200:
            with open(download_path, 'wb') as f:
                f.write(my_image.content)
        else:
            print("Failed to retrieve image")
    return my_url


def img_to_text(img_url="", img_base64="", prompt="What’s in this image?", print_out=True):
    if img_url:
        img_desc_response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": img_url,
                            },
                        },
                    ],
                }
            ],
            max_tokens=500,
        )
        if print_out:
            print(img_desc_response.choices[0].message.content)
        return img_desc_response.choices[0].message.content
    elif img_base64:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
        }
        payload = {
            "model": "gpt-4-turbo",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{img_base64}"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": 300
        }
        img_desc_response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        if print_out:
            print(img_desc_response.json()["choices"][0]["message"]["content"])
        return img_desc_response.json()["choices"][0]["message"]["content"]
    else:
        raise ValueError("provide either img_url or img_base64")


def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
    return encoded_string


def table_to_text(table=None, prompt="describe this table in plain text. "
                                     "be as precise as possible. spare no detail. "
                                     "what is in this table?", print_out=True):
    if table is not None:
        response = gpt4_new(f"{prompt} TABLE: {table}")
        if print_out:
            print(response)
        return response
    else:
        raise ValueError("no table provided")


if __name__ == "__main__":
    # here are all functions that directly call openai
    # img_create("a skier in the swiss alps", download_path="skier.png")
    # img_to_text(img_base64=encode_image_to_base64("skier.png"))
    # print(image_to_base64("skier.png"))
    # print(vectorize_data("test string"))

    print(gpt4_new("Sag kurz Hallo."))  # gpt4_new requires a prompt; this one is a sample smoke test
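A short usage sketch for these wrappers; the file name and prompts are placeholders, and OPENAI_API_KEY must be set:

# usage sketch (hypothetical file name; requires OPENAI_API_KEY)
url = img_create("a sketch of a hospital floor plan", download_path="plan.png")
img_to_text(img_base64=encode_image_to_base64("plan.png"))
vec = vectorize_data("Brandschutzkonzept Untergeschoss")  # 1536-dim embedding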
my_vectors.py → util/my_vectors.py
RENAMED
File without changes
setup_db.py → util/setup_db.py
RENAMED
import time
import openpyxl
import my_new_openai


def update_excel_with_sums(filename):
    # Load the workbook and select the active worksheet
    workbook = openpyxl.load_workbook(filename)
    sheet = workbook.active

    # Iterate through each row in the sheet
    for row in sheet.iter_rows(min_row=1, min_col=2, max_col=3):
        Bn, Cn = row  # Assuming B and C are columns 2 and 3 respectively
        vector = my_new_openai.vectorize_data(f"{Bn.value}: {Cn.value}") if Bn.value and Cn.value else 0
        if vector != 0:
            # Write the embedding into columns D onwards, one dimension per column
            for i, val in enumerate(vector):
                sheet.cell(row=Bn.row, column=4 + i).value = val

    # Save the workbook
    workbook.save(filename)
    print(f"Updated the file '{filename}' with vectors in column D.")


def load_vectorstore_from_excel(filename):
    # returns a dictionary {label: 1536-dim vector}
    # Load the workbook and select the active worksheet
    workbook = openpyxl.load_workbook(filename)
    sheet = workbook.active

    # Iterate through each row in the sheet
    vec_store = {}
    for row in range(3, 634):
        vec = []
        for col in range(0, 1536):
            val = sheet.cell(row=row, column=4 + col).value
            vec.append(val)
        vec_store[str(sheet.cell(row=row, column=1).value)] = vec
    return vec_store


if __name__ == '__main__':
    # update_excel_with_sums("KBOB_Klassifizierung.xlsx")
    t = time.time()

    vec_store = load_vectorstore_from_excel("../data/KBOB_Klassifizierung.xlsx")  # path updated with the move to data/

    print(time.time() - t)
    for e in vec_store.keys():
        print(f"{e}: {vec_store[e][0]}, {vec_store[e][1]}, .... {vec_store[e][-1]}")
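Taken together with my_2_sim_search, the Excel vector store can back a KBOB label lookup. A sketch, run from inside util/ since the modules import each other by top-level name; the query string is a placeholder and OPENAI_API_KEY must be set:

# usage sketch: rank KBOB labels against a query (run from inside util/)
import setup_db
import my_2_sim_search

vec_store = setup_db.load_vectorstore_from_excel("../data/KBOB_Klassifizierung.xlsx")
ranking = my_2_sim_search.sim_search_fly(vec_store, "Lüftungsanlage Steigzone")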