csalabs committed on
Commit
66adac7
1 Parent(s): 036ce5b

Upload 17 files

Files changed (17)
  1. .dockerignore +4 -0
  2. .editorconfig +17 -0
  3. .flake8 +4 -0
  4. .pre-commit-config.yaml +49 -0
  5. .pyup.yml +17 -0
  6. ACKNOWLEDGEMENT.md +10 -0
  7. CONTRIBUTING.md +47 -0
  8. Dockerfile +21 -0
  9. LICENSE +201 -0
  10. README.md +287 -13
  11. constants.py +142 -0
  12. ingest.py +161 -0
  13. localGPT_UI.py +119 -0
  14. pyproject.toml +15 -0
  15. requirements.txt +32 -0
  16. run_localGPT.py +247 -0
  17. run_localGPT_API.py +173 -0
.dockerignore ADDED
@@ -0,0 +1,4 @@
1
+ *
2
+ !*.py
3
+ !requirements.txt
4
+ !SOURCE_DOCUMENTS
.editorconfig ADDED
@@ -0,0 +1,17 @@
1
+ # http://editorconfig.org
2
+
3
+ root = true
4
+
5
+ [*]
6
+ charset = utf-8
7
+ end_of_line = lf
8
+ insert_final_newline = true
9
+ trim_trailing_whitespace = true
10
+
11
+ [*.{py,rst,ini}]
12
+ indent_style = space
13
+ indent_size = 4
14
+
15
+ [*.{html,css,scss,json,yml,xml}]
16
+ indent_style = space
17
+ indent_size = 2
.flake8 ADDED
@@ -0,0 +1,4 @@
1
+ [flake8]
2
+ exclude = docs
3
+ max-line-length = 119
4
+ extend-ignore = E203
.pre-commit-config.yaml ADDED
@@ -0,0 +1,49 @@
1
+ default_stages: [commit]
2
+
3
+ repos:
4
+ - repo: https://github.com/pre-commit/pre-commit-hooks
5
+ rev: v4.4.0
6
+ hooks:
7
+ - id: trailing-whitespace
8
+ - id: end-of-file-fixer
9
+ - id: check-json
10
+ - id: check-toml
11
+ - id: check-xml
12
+ - id: check-yaml
13
+ - id: debug-statements
14
+ - id: check-builtin-literals
15
+ - id: check-case-conflict
16
+ - id: detect-private-key
17
+
18
+ - repo: https://github.com/pre-commit/mirrors-prettier
19
+ rev: "v3.0.0-alpha.9-for-vscode"
20
+ hooks:
21
+ - id: prettier
22
+ args: ["--tab-width", "2"]
23
+
24
+ - repo: https://github.com/asottile/pyupgrade
25
+ rev: v3.4.0
26
+ hooks:
27
+ - id: pyupgrade
28
+ args: [--py311-plus]
29
+ exclude: hooks/
30
+
31
+ - repo: https://github.com/psf/black
32
+ rev: 23.3.0
33
+ hooks:
34
+ - id: black
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+
41
+ - repo: https://github.com/PyCQA/flake8
42
+ rev: 6.0.0
43
+ hooks:
44
+ - id: flake8
45
+
46
+ ci:
47
+ autoupdate_schedule: weekly
48
+ skip: []
49
+ submodules: false
.pyup.yml ADDED
@@ -0,0 +1,17 @@
1
+ # configure updates globally
2
+ # default: all
3
+ # allowed: all, insecure, False
4
+ update: all
5
+
6
+ # configure dependency pinning globally
7
+ # default: True
8
+ # allowed: True, False
9
+ pin: True
10
+
11
+ # add a label to pull requests, default is not set
12
+ # requires private repo permissions, even on public repos
13
+ # default: empty
14
+ label_prs: update
15
+
16
+ requirements:
17
+ - "requirements.txt"
ACKNOWLEDGEMENT.md ADDED
@@ -0,0 +1,10 @@
1
+ # Acknowledgments
2
+
3
+ Some code was taken from or inspired by other projects:
4
+
5
+ - [CookieCutter Django][cookiecutter-django]
6
+ - `pre-commit-config.yaml` is taken from there with almost no changes
7
+ - `github-actions.yml` is inspired by `gitlab-ci.yml`
8
+ `.pyup.yml`, `.flake8`, `.editorconfig`, `pyproject.toml` are taken from there with minor changes.
9
+
10
+ [cookiecutter-django]: https://github.com/cookiecutter/cookiecutter-django
CONTRIBUTING.md ADDED
@@ -0,0 +1,47 @@
1
+ # How to Contribute
2
+
3
+ We are always happy to receive issue reports and pull requests!
4
+
5
+ ## General considerations
6
+
7
+ 1. Keep it small. The smaller the change, the more likely we are to accept.
8
+ 2. Changes that fix a current issue get priority for review.
9
+ 3. Check out the [GitHub guide][submit-a-pr] if you've never created a pull request before.
10
+
11
+ ## Getting started
12
+
13
+ 1. Fork the repo
14
+ 2. Clone your fork
15
+ 3. Create a branch for your changes
16
+
17
+ This last step is very important: don't start developing from master, as it will cause pain if you need to send another change later.
18
+
19
+ TIP: If you're working on a GitHub issue, name your branch after the issue number, e.g. `issue-123-<ISSUE-NAME>`. This will help us keep track of what you're working on. If there is no issue for what you're working on, please create one first. Someone else might be working on the same thing, or we might have a reason for not wanting to do it.
20
+
21
+ ## Pre-commit
22
+
23
+ GitHub Actions will run the pre-commit hooks on your PR. If the hooks fail, you will need to fix them before your PR can be merged. It will save you a lot of time if you run the hooks locally before you push your changes. To do that, you need to install pre-commit on your local machine:
24
+
25
+ ```shell
26
+ pip install pre-commit
27
+ ```
28
+
29
+ Once installed, you need to add the pre-commit hooks to your local repo.
30
+
31
+ ```shell
32
+ pre-commit install
33
+ ```
34
+
35
+ Now, every time you commit, the hooks will run and check your code. If they fail, you will need to fix them before you can commit.
36
+
37
+ If you have already committed changes without the pre-commit hooks installed and do not want to reset and recommit, you can run the following command to run the hooks across your whole local repo.
38
+
39
+ ```shell
40
+ pre-commit run --all-files
41
+ ```
42
+
43
+ ## Help Us Improve This Documentation
44
+
45
+ If you find that something is missing or have suggestions for improvements, please submit a PR.
46
+
47
+ [submit-a-pr]: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request
Dockerfile ADDED
@@ -0,0 +1,21 @@
1
+ # syntax=docker/dockerfile:1
2
+ # Build as `docker build . -t localgpt`, requires BuildKit.
3
+ # Run as `docker run -it --mount src="$HOME/.cache",target=/root/.cache,type=bind --gpus=all localgpt`, requires Nvidia container toolkit.
4
+
5
+ FROM nvidia/cuda:11.7.1-runtime-ubuntu22.04
6
+ RUN apt-get update && apt-get install -y software-properties-common
7
+ RUN apt-get install -y g++-11 make python3 python-is-python3 pip
8
+ # only copy what's needed at every step to optimize layer cache
9
+ COPY ./requirements.txt .
10
+ # use BuildKit cache mount to drastically reduce redownloading from pip on repeated builds
11
+ RUN --mount=type=cache,target=/root/.cache CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --timeout 100 -r requirements.txt
12
+ COPY SOURCE_DOCUMENTS ./SOURCE_DOCUMENTS
13
+ COPY ingest.py constants.py ./
14
+ # Docker BuildKit does not support GPU during *docker build* time right now, only during *docker run*.
15
+ # See <https://github.com/moby/buildkit/issues/1436>.
16
+ # If this changes in the future you can `docker build --build-arg device_type=cuda . -t localgpt` (+GPU argument to be determined).
17
+ ARG device_type=cpu
18
+ RUN --mount=type=cache,target=/root/.cache python ingest.py --device_type $device_type
19
+ COPY . .
20
+ ENV device_type=cuda
21
+ CMD python run_localGPT.py --device_type $device_type
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,13 +1,287 @@
1
- ---
2
- title: AI EMBD
3
- emoji: 🐠
4
- colorFrom: indigo
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 3.44.2
8
- app_file: app.py
9
- pinned: false
10
- license: llama2
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # localGPT
2
+
3
+ This project was inspired by the original [privateGPT](https://github.com/imartinez/privateGPT). Most of the description here is inspired by the original privateGPT.
4
+
5
+ For a detailed overview of the project, watch these videos:
6
+ - [Detailed code-walkthrough](https://youtu.be/MlyoObdIHyo).
7
+ - [Llama-2 with LocalGPT](https://youtu.be/lbFmceo4D5E)
8
+ - [Adding Chat History](https://youtu.be/d7otIM_MCZs)
9
+
10
+ In this project, I have replaced the GPT4ALL model with the Vicuna-7B model, and we use InstructorEmbeddings instead of the LlamaEmbeddings used in the original privateGPT. Both the embeddings and the LLM run on the GPU instead of the CPU. CPU support is also available if you do not have a GPU (see below for instructions).
11
+
12
+ Ask questions to your documents without an internet connection, using the power of LLMs. 100% private, no data leaves your execution environment at any point. You can ingest documents and ask questions without an internet connection!
13
+
14
+ Built with [LangChain](https://github.com/hwchase17/langchain), [Vicuna-7B](https://huggingface.co/TheBloke/vicuna-7B-1.1-HF) (+ a lot more!), and [InstructorEmbeddings](https://instructor-embedding.github.io/).
15
+
16
+ # Environment Setup
17
+
18
+ Install conda and create a new environment
19
+
20
+ ```shell
21
+ conda create -n localGPT
22
+ ```
23
+
24
+ Activate the environment
25
+
26
+ ```shell
27
+ conda activate localGPT
28
+ ```
29
+
30
+ In order to set your environment up to run the code here, first install all requirements:
31
+
32
+ ```shell
33
+ pip install -r requirements.txt
34
+ ```
35
+
36
+
37
+ If you want to use BLAS or Metal with [llama-cpp](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal), you can set the appropriate flags:
38
+
39
+ ```shell
40
+ # Example: cuBLAS
41
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install -r requirements.txt
42
+ ```
43
+
44
+ ## Docker
45
+
46
+ Installing the required packages for GPU inference on Nvidia GPUs, like gcc 11 and CUDA 11, may cause conflicts with other packages in your system.
47
+ As an alternative to Conda, you can use Docker with the provided Dockerfile.
48
+ It includes CUDA; your system just needs Docker, BuildKit, your Nvidia GPU driver, and the Nvidia container toolkit.
49
+ Build as `docker build . -t localgpt`, requires BuildKit.
50
+ Docker BuildKit does not support GPU during *docker build* time right now, only during *docker run*.
51
+ Run as `docker run -it --mount src="$HOME/.cache",target=/root/.cache,type=bind --gpus=all localgpt`.
52
+
53
+ ## Test dataset
54
+
55
+ This repo uses 4 PDFs of the Ontario Rule Book as an example.
56
+
57
+ ## Instructions for ingesting your own dataset
58
+
59
+ Put any and all of your .txt, .pdf, or .csv files into the SOURCE_DOCUMENTS directory.
60
+ In the load_documents() function, replace the docs_path with the absolute path of your source_documents directory.
61
+
62
+ The current default file types are .txt, .pdf, .csv, and .xlsx; if you want to use any other file type, you will need to convert it to one of the defaults.
63
+
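+ The set of supported extensions is driven by the `DOCUMENT_MAP` dictionary in `constants.py`, so an extra plain-text format can be wired in by mapping its extension to one of the loaders already imported there. A minimal sketch (the `.log` extension is only an illustrative assumption, not a project default):
+
+ ```python
+ # In constants.py -- TextLoader is already imported at the top of that file.
+ from langchain.document_loaders import TextLoader
+
+ DOCUMENT_MAP[".log"] = TextLoader  # ingest .log files as plain text
+ ```
+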
64
+ Run the following command to ingest all the data.
65
+
66
+ (The device type defaults to `cuda`.)
67
+
68
+ ```shell
69
+ python ingest.py
70
+ ```
71
+
72
+ Use the device type argument to specify a given device.
73
+
74
+ ```sh
75
+ python ingest.py --device_type cpu
76
+ ```
77
+
78
+ Use help for a full list of supported devices.
79
+
80
+ ```sh
81
+ python ingest.py --help
82
+ ```
83
+
84
+ This will create an index containing the local vectorstore. It will take time, depending on the size of your documents.
85
+ You can ingest as many documents as you want, and all will be accumulated in the local embeddings database.
86
+ If you want to start from an empty database, delete the `index`.
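+
+ A minimal reset sketch, assuming the default `PERSIST_DIRECTORY` (the `DB` folder) defined in `constants.py`:
+
+ ```python
+ # Sketch: wipe the persisted Chroma database so the next ingest starts from scratch.
+ import shutil
+
+ from constants import PERSIST_DIRECTORY  # defaults to <repo>/DB
+
+ shutil.rmtree(PERSIST_DIRECTORY, ignore_errors=True)
+ ```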
87
+
88
+ Note: When you run this for the first time, it will take time as it has to download the embedding model. In subsequent runs, no data will leave your local environment, and ingestion can be run without an internet connection.
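+
+ If you want to cache the embedding model ahead of time (while you still have a connection), a minimal sketch using the default `EMBEDDING_MODEL_NAME` from `constants.py`:
+
+ ```python
+ # Sketch: instantiating the embeddings once downloads and caches the model locally,
+ # so later ingest runs can work without an internet connection.
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
+
+ from constants import EMBEDDING_MODEL_NAME  # "hkunlp/instructor-large" by default
+
+ HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME)
+ ```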
89
+
90
+ ## Ask questions to your documents, locally!
91
+
92
+ In order to ask a question, run a command like:
93
+
94
+ ```shell
95
+ python run_localGPT.py
96
+ ```
97
+
98
+ And wait for the script to require your input.
99
+
100
+ ```shell
101
+ > Enter a query:
102
+ ```
103
+
104
+ Hit Enter and wait while the LLM consumes the prompt and prepares the answer. Once done, it will print the answer and the 4 sources it used as context from your documents; you can then ask another question without re-running the script: just wait for the prompt again.
105
+
106
+ Note: When you run this for the first time, it will need an internet connection to download the Vicuna-7B model. After that, you can turn off your internet connection and inference will still work. No data leaves your local environment.
107
+
108
+ Type `exit` to finish the script.
109
+
110
+ # Run it on CPU
111
+
112
+ By default, localGPT will use your GPU to run both the `ingest.py` and `run_localGPT.py` scripts. But if you do not have a GPU and want to run this on CPU, you can do that too (warning: it's going to be slow!). You will need to use the `--device_type cpu` flag with both scripts.
113
+
114
+ For ingestion, run the following:
115
+
116
+ ```shell
117
+ python ingest.py --device_type cpu
118
+ ```
119
+
120
+ In order to ask a question, run a command like:
121
+
122
+ ```shell
123
+ python run_localGPT.py --device_type cpu
124
+ ```
125
+
126
+ # Run quantized for M1/M2:
127
+
128
+ GGML quantized models for Apple Silicon (M1/M2) are supported through the llama-cpp library, [example](https://huggingface.co/TheBloke/Wizard-Vicuna-13B-Uncensored-GGML). GPTQ quantized models that leverage auto-gptq will not work, [see here](https://github.com/PanQiWei/AutoGPTQ/issues/133#issuecomment-1575002893). GGML models will work for CPU or MPS.
129
+
130
+ ## Troubleshooting
131
+
132
+ **Install MPS:**
133
+ 1- Follow this [page](https://developer.apple.com/metal/pytorch/) to set up PyTorch with Metal Performance Shaders (MPS) support. PyTorch uses the new MPS backend for GPU acceleration. It is good practice to verify MPS support using a simple Python script as mentioned in the provided link (a minimal check is also sketched after these steps).
134
+
135
+ 2- Following that page, here is an example of what you may run in your terminal:
136
+
137
+ ```shell
138
+ xcode-select --install
139
+ conda install pytorch torchvision torchaudio -c pytorch-nightly
140
+ pip install chardet
141
+ pip install cchardet
142
+ pip uninstall charset_normalizer
143
+ pip install charset_normalizer
144
+ pip install pdfminer.six
145
+ pip install xformers
146
+ ```
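+
+ Once PyTorch is installed, a quick way to confirm that the MPS backend is actually usable (a minimal check, not specific to this project):
+
+ ```python
+ # Sanity check for the Metal (MPS) backend in PyTorch.
+ import torch
+
+ print("MPS built:", torch.backends.mps.is_built())
+ print("MPS available:", torch.backends.mps.is_available())
+ ```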
147
+
148
+ **Upgrade packages:**
149
+ Your langchain or llama-cpp version could be outdated. Upgrade your packages by running the install again:
150
+
151
+ ```shell
152
+ pip install -r requirements.txt
153
+ ```
154
+
155
+ If you are still getting errors, try installing the latest llama-cpp-python with these flags, and [see thread](https://github.com/abetlen/llama-cpp-python/issues/317#issuecomment-1587962205).
156
+
157
+ ```shell
158
+ CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
159
+ ```
160
+
161
+ # Run the UI
162
+
163
+ 1. Open `constants.py` in an editor of your choice and, depending on your preference, set the LLM you want to use. By default, the following model will be used:
164
+
165
+ ```shell
166
+ MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
167
+ MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
168
+ ```
169
+
170
+ 3. Open up a terminal and activate your python environment that contains the dependencies installed from requirements.txt.
171
+
172
+ 4. Navigate to the `/LOCALGPT` directory.
173
+
174
+ 5. Run the command `python run_localGPT_API.py`. The API should begin to run.
175
+
176
+ 6. Wait until everything has loaded in. You should see something like `INFO:werkzeug:Press CTRL+C to quit`.
177
+
178
+ 7. Open up a second terminal and activate the same python environment.
179
+
180
+ 8. Navigate to the `/LOCALGPT/localGPTUI` directory.
181
+
182
+ 9. Run the command `python localGPTUI.py`.
183
+
184
+ 10. Open up a web browser and go to the address `http://localhost:5111/`.
185
+
186
+ # How does it work?
187
+
188
+ By selecting the right local models and leveraging the power of `LangChain`, you can run the entire pipeline locally, without any data leaving your environment, and with reasonable performance. A condensed sketch of the pipeline follows the list below.
189
+
190
+ - `ingest.py` uses `LangChain` tools to parse the document and create embeddings locally using `InstructorEmbeddings`. It then stores the result in a local vector database using `Chroma` vector store.
191
+ - `run_localGPT.py` uses a local LLM to understand questions and create answers. The context for the answers is extracted from the local vector store using a similarity search to locate the right piece of context from the docs.
192
+ - You can replace this local LLM with any other LLM from HuggingFace. Make sure whatever LLM you select is in the HF format.
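+
+ A condensed sketch of that flow, reusing the defaults from `constants.py` and the `load_model` helper from `run_localGPT.py` (the prompt template and chat-history memory are omitted for brevity; the question string is only an example):
+
+ ```python
+ # Sketch: embeddings -> Chroma retriever -> local LLM, mirroring run_localGPT.py.
+ from langchain.chains import RetrievalQA
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
+ from langchain.vectorstores import Chroma
+
+ from constants import EMBEDDING_MODEL_NAME, MODEL_BASENAME, MODEL_ID, PERSIST_DIRECTORY
+ from run_localGPT import load_model
+
+ embeddings = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME)
+ db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)
+ qa = RetrievalQA.from_chain_type(
+     llm=load_model("cuda", model_id=MODEL_ID, model_basename=MODEL_BASENAME),
+     chain_type="stuff",
+     retriever=db.as_retriever(),
+     return_source_documents=True,
+ )
+ res = qa("What is this document about?")  # example question only
+ print(res["result"])
+ ```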
193
+
194
+ # How to select different LLM models?
195
+
196
+ The following instructions explain how to select a different LLM model to create your responses; a consolidated example follows the steps below:
197
+
198
+ 1. Open up `constants.py` in the editor of your choice.
199
+ 2. Change the `MODEL_ID` and `MODEL_BASENAME`. If you are using a quantized model (`GGML`, `GPTQ`), you will need to provide `MODEL_BASENAME`. For unquantized models, set `MODEL_BASENAME` to `None`.
200
+ 5. A number of example models from HuggingFace have already been tested, both original trained models (ending with HF or with a .bin file in their "Files and versions") and quantized models (ending with GPTQ or with .no-act-order or .safetensors files in their "Files and versions").
201
+ 6. For models that end with HF or have a .bin file inside "Files and versions" on their HuggingFace page:
202
+
203
+ - Make sure you have a model_id selected. For example -> `MODEL_ID = "TheBloke/guanaco-7B-HF"`
204
+ - If you go to its HuggingFace [repo](https://huggingface.co/TheBloke/guanaco-7B-HF) and go to "Files and versions" you will notice model files that end with a .bin extension.
205
+ - Any model files that contain .bin extensions will be run with the following code where the `# load the LLM for generating Natural Language responses` comment is found.
206
+ - `MODEL_ID = "TheBloke/guanaco-7B-HF"`
207
+
208
+ 7. For models that contain GPTQ in their name and/or have a .no-act-order or .safetensors extension inside "Files and versions" on their HuggingFace page:
209
+
210
+ - Make sure you have a model_id selected. For example -> model_id = `"TheBloke/wizardLM-7B-GPTQ"`
211
+ - You will also need its model basename file selected. For example -> `model_basename = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"`
212
+ - If you go to its HuggingFace [repo](https://huggingface.co/TheBloke/wizardLM-7B-GPTQ) and go to "Files and versions" you will notice a model file that ends with a .safetensors extension.
213
+ - Any model files that contain no-act-order or .safetensors extensions will be run with the following code where the `# load the LLM for generating Natural Language responses` comment is found.
214
+ - `MODEL_ID = "TheBloke/WizardLM-7B-uncensored-GPTQ"`
215
+
216
+ `MODEL_BASENAME = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"`
217
+
218
+
219
+ 8. Comment out all other instances of `MODEL_ID="other model names"`, `MODEL_BASENAME=other base model names`, and `llm = load_model(args*)`
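+
+ Putting these steps together, a consolidated sketch of the `constants.py` settings for the three model families (the values are taken from the commented examples in `constants.py`; keep exactly one pair uncommented):
+
+ ```python
+ # constants.py -- keep exactly one MODEL_ID / MODEL_BASENAME pair active.
+
+ # GGML (quantized, llama.cpp; works on CPU, CUDA or MPS) -- current default:
+ MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
+ MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+
+ # Full HF model (unquantized):
+ # MODEL_ID = "TheBloke/guanaco-7B-HF"
+ # MODEL_BASENAME = None
+
+ # GPTQ (quantized, GPU only):
+ # MODEL_ID = "TheBloke/WizardLM-7B-uncensored-GPTQ"
+ # MODEL_BASENAME = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
+ ```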
220
+
221
+ # System Requirements
222
+
223
+ ## Python Version
224
+
225
+ To use this software, you must have Python 3.10 or later installed. Earlier versions of Python are not supported.
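+
+ A quick way to check which interpreter your environment is using:
+
+ ```python
+ # Prints the interpreter version; it should report 3.10 or newer.
+ import sys
+
+ print(sys.version)
+ assert sys.version_info >= (3, 10), "localGPT requires Python 3.10+"
+ ```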
226
+
227
+ ## C++ Compiler
228
+
229
+ If you encounter an error while building a wheel during the `pip install` process, you may need to install a C++ compiler on your computer.
230
+
231
+ ### For Windows 10/11
232
+
233
+ To install a C++ compiler on Windows 10/11, follow these steps:
234
+
235
+ 1. Install Visual Studio 2022.
236
+ 2. Make sure the following components are selected:
237
+ - Universal Windows Platform development
238
+ - C++ CMake tools for Windows
239
+ 3. Download the MinGW installer from the [MinGW website](https://sourceforge.net/projects/mingw/).
240
+ 4. Run the installer and select the "gcc" component.
241
+
242
+ ### NVIDIA Driver Issues
243
+
244
+ Follow this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-22-04) to install NVIDIA Drivers.
245
+
246
+ ## Star History
247
+
248
+ [![Star History Chart](https://api.star-history.com/svg?repos=PromtEngineer/localGPT&type=Date)](https://star-history.com/#PromtEngineer/localGPT&Date)
249
+
250
+ # Disclaimer
251
+
252
+ This is a test project to validate the feasibility of a fully local solution for question answering using LLMs and vector embeddings. It is not production ready, and it is not meant to be used in production. Vicuna-7B is based on the Llama model, so it carries the original Llama license.
253
+
254
+ # Common Errors
255
+
256
+ - [Torch not compatible with CUDA enabled](https://github.com/pytorch/pytorch/issues/30664) (a quick sanity check is sketched after this list)
257
+
258
+ - Get CUDA version
259
+ ```shell
260
+ nvcc --version
261
+ ```
262
+ ```shell
263
+ nvidia-smi
264
+ ```
265
+ - Try installing PyTorch depending on your CUDA version
266
+ ```shell
267
+ conda install -c pytorch torchvision cudatoolkit=10.1 pytorch
268
+ ```
269
+ - If it doesn't work, try reinstalling
270
+ ```shell
271
+ pip uninstall torch
272
+ pip cache purge
273
+ pip install torch -f https://download.pytorch.org/whl/torch_stable.html
274
+ ```
275
+
276
+ - [ERROR: pip's dependency resolver does not currently take into account all the packages that are installed](https://stackoverflow.com/questions/72672196/error-pips-dependency-resolver-does-not-currently-take-into-account-all-the-pa/76604141#76604141)
277
+ ```shell
278
+ pip install h5py
279
+ pip install typing-extensions
280
+ pip install wheel
281
+ ```
282
+ - [Failed to import transformers](https://github.com/huggingface/transformers/issues/11262)
283
+ - Try re-install
284
+ ```shell
285
+ conda uninstall tokenizers transformers
286
+ pip install transformers
287
+ ```
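+
+ For the CUDA-related errors above, a minimal sanity check that PyTorch actually sees your GPU (run it inside the same environment you use for localGPT):
+
+ ```python
+ # Reports the installed PyTorch build, the CUDA version it was compiled against,
+ # and whether a CUDA device is visible right now.
+ import torch
+
+ print("torch:", torch.__version__)
+ print("built with CUDA:", torch.version.cuda)
+ print("CUDA available:", torch.cuda.is_available())
+ if torch.cuda.is_available():
+     print("device:", torch.cuda.get_device_name(0))
+ ```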
constants.py ADDED
@@ -0,0 +1,142 @@
1
+ import os
2
+
3
+ # from dotenv import load_dotenv
4
+ from chromadb.config import Settings
5
+
6
+ # https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
7
+ from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader
8
+
9
+ # load_dotenv()
10
+ ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
11
+
12
+ # Define the folder for storing database
13
+ SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/SOURCE_DOCUMENTS"
14
+
15
+ PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/DB"
16
+
17
+ # Can be changed to a specific number
18
+ INGEST_THREADS = os.cpu_count() or 8
19
+
20
+ # Define the Chroma settings
21
+ CHROMA_SETTINGS = Settings(
22
+ anonymized_telemetry=False,
23
+ is_persistent=True,
24
+ )
25
+
26
+
27
+ # https://python.langchain.com/en/latest/_modules/langchain/document_loaders/excel.html#UnstructuredExcelLoader
28
+ DOCUMENT_MAP = {
29
+ ".txt": TextLoader,
30
+ ".md": TextLoader,
31
+ ".py": TextLoader,
32
+ ".pdf": PDFMinerLoader,
33
+ ".csv": CSVLoader,
34
+ ".xls": UnstructuredExcelLoader,
35
+ ".xlsx": UnstructuredExcelLoader,
36
+ ".docx": Docx2txtLoader,
37
+ ".doc": Docx2txtLoader,
38
+ }
39
+
40
+ # Default Instructor Model
41
+ EMBEDDING_MODEL_NAME = "hkunlp/instructor-large" # Uses 1.5 GB of VRAM (High Accuracy with lower VRAM usage)
42
+
43
+ ####
44
+ #### OTHER EMBEDDING MODEL OPTIONS
45
+ ####
46
+
47
+ # EMBEDDING_MODEL_NAME = "hkunlp/instructor-xl" # Uses 5 GB of VRAM (Most Accurate of all models)
48
+ # EMBEDDING_MODEL_NAME = "intfloat/e5-large-v2" # Uses 1.5 GB of VRAM (A little less accurate than instructor-large)
49
+ # EMBEDDING_MODEL_NAME = "intfloat/e5-base-v2" # Uses 0.5 GB of VRAM (A good model for lower VRAM GPUs)
50
+ # EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" # Uses 0.2 GB of VRAM (Less accurate but fastest - only requires 150mb of vram)
51
+
52
+ ####
53
+ #### MULTILINGUAL EMBEDDING MODELS
54
+ ####
55
+
56
+ # EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large" # Uses 2.5 GB of VRAM
57
+ # EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base" # Uses 1.2 GB of VRAM
58
+
59
+
60
+ #### SELECT AN OPEN SOURCE LLM (LARGE LANGUAGE MODEL)
61
+ # Select the Model ID and model_basename
62
+ # load the LLM for generating Natural Language responses
63
+
64
+ #### GPU VRAM Memory required for LLM Models (ONLY) by Billion Parameter value (B Model)
65
+ #### Does not include VRAM used by Embedding Models - which use an additional 2GB-7GB of VRAM depending on the model.
66
+ ####
67
+ #### (B Model) (float32) (float16) (GPTQ 8bit) (GPTQ 4bit)
68
+ #### 7b 28 GB 14 GB 7 GB - 9 GB 3.5 GB - 5 GB
69
+ #### 13b 52 GB 26 GB 13 GB - 15 GB 6.5 GB - 8 GB
70
+ #### 32b 130 GB 65 GB 32.5 GB - 35 GB 16.25 GB - 19 GB
71
+ #### 65b 260.8 GB 130.4 GB 65.2 GB - 67 GB 32.6 GB - - 35 GB
72
+
73
+ MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
74
+ MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
75
+
76
+ ####
77
+ #### (FOR HF MODELS)
78
+ ####
79
+
80
+ # MODEL_ID = "TheBloke/vicuna-7B-1.1-HF"
81
+ # MODEL_BASENAME = None
82
+ # MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-HF"
83
+ # MODEL_ID = "TheBloke/guanaco-7B-HF"
84
+ # MODEL_ID = 'NousResearch/Nous-Hermes-13b' # Requires ~ 23GB VRAM. Using STransformers
85
+ # alongside will 100% create OOM on 24GB cards.
86
+ # llm = load_model(device_type, model_id=model_id)
87
+
88
+ ####
89
+ #### (FOR GPTQ QUANTIZED) Select a llm model based on your GPU and VRAM GB. Does not include Embedding Models VRAM usage.
90
+ ####
91
+
92
+ ##### 48GB VRAM Graphics Cards (RTX 6000, RTX A6000 and other 48GB VRAM GPUs) #####
93
+
94
+ ### 65b GPTQ LLM Models for 48GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
95
+ # model_id = "TheBloke/guanaco-65B-GPTQ"
96
+ # model_basename = "model.safetensors"
97
+ # model_id = "TheBloke/Airoboros-65B-GPT4-2.0-GPTQ"
98
+ # model_basename = "model.safetensors"
99
+ # model_id = "TheBloke/gpt4-alpaca-lora_mlp-65B-GPTQ"
100
+ # model_basename = "model.safetensors"
101
+ # model_id = "TheBloke/Upstage-Llama1-65B-Instruct-GPTQ"
102
+ # model_basename = "model.safetensors"
103
+
104
+ ##### 24GB VRAM Graphics Cards (RTX 3090 - RTX 4090 (35% Faster) - RTX A5000 - RTX A5500) #####
105
+
106
+ ### 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
107
+ # model_id = "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ"
108
+ # model_basename = "Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
109
+ # model_id = "TheBloke/vicuna-13B-v1.5-GPTQ"
110
+ # model_basename = "model.safetensors"
111
+ # model_id = "TheBloke/Nous-Hermes-13B-GPTQ"
112
+ # model_basename = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order"
113
+ # model_id = "TheBloke/WizardLM-13B-V1.2-GPTQ"
114
+ # model_basename = "gptq_model-4bit-128g.safetensors
115
+
116
+ ### 30b GPTQ Models for 24GB GPUs (*** Requires using intfloat/e5-base-v2 instead of hkunlp/instructor-large as embedding model ***)
117
+ # model_id = "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ"
118
+ # model_basename = "Wizard-Vicuna-30B-Uncensored-GPTQ-4bit--1g.act.order.safetensors"
119
+ # model_id = "TheBloke/WizardLM-30B-Uncensored-GPTQ"
120
+ # model_basename = "WizardLM-30B-Uncensored-GPTQ-4bit.act-order.safetensors"
121
+
122
+ ##### 8-10GB VRAM Graphics Cards (RTX 3080 - RTX 3080 Ti - RTX 3070 Ti - 3060 Ti - RTX 2000 Series, Quadro RTX 4000, 5000, 6000) #####
123
+ ### (*** Requires using intfloat/e5-small-v2 instead of hkunlp/instructor-large as embedding model ***)
124
+
125
+ ### 7b GPTQ Models for 8GB GPUs
126
+ # model_id = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
127
+ # model_basename = "Wizard-Vicuna-7B-Uncensored-GPTQ-4bit-128g.no-act.order.safetensors"
128
+ # model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
129
+ # model_basename = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
130
+ # model_id = "TheBloke/wizardLM-7B-GPTQ"
131
+ # model_basename = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"
132
+
133
+ ####
134
+ #### (FOR GGML) (Quantized cpu+gpu+mps) models - check if they support llama.cpp
135
+ ####
136
+
137
+ # MODEL_ID = "TheBloke/wizard-vicuna-13B-GGML"
138
+ # MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q4_0.bin"
139
+ # MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q6_K.bin"
140
+ # MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q2_K.bin"
141
+ # MODEL_ID = "TheBloke/orca_mini_3B-GGML"
142
+ # MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"
ingest.py ADDED
@@ -0,0 +1,161 @@
1
+ import logging
2
+ import os
3
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
4
+
5
+ import click
6
+ import torch
7
+ from langchain.docstore.document import Document
8
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
9
+ from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
10
+ from langchain.vectorstores import Chroma
11
+
12
+
13
+
14
+ from constants import (
15
+ CHROMA_SETTINGS,
16
+ DOCUMENT_MAP,
17
+ EMBEDDING_MODEL_NAME,
18
+ INGEST_THREADS,
19
+ PERSIST_DIRECTORY,
20
+ SOURCE_DIRECTORY,
21
+ )
22
+
23
+
24
+ def load_single_document(file_path: str) -> Document:
25
+ # Loads a single document from a file path
26
+ file_extension = os.path.splitext(file_path)[1]
27
+ loader_class = DOCUMENT_MAP.get(file_extension)
28
+ if loader_class:
29
+ loader = loader_class(file_path)
30
+ else:
31
+ raise ValueError("Document type is undefined")
32
+ return loader.load()[0]
33
+
34
+
35
+ def load_document_batch(filepaths):
36
+ logging.info("Loading document batch")
37
+ # create a thread pool
38
+ with ThreadPoolExecutor(len(filepaths)) as exe:
39
+ # load files
40
+ futures = [exe.submit(load_single_document, name) for name in filepaths]
41
+ # collect data
42
+ data_list = [future.result() for future in futures]
43
+ # return data and file paths
44
+ return (data_list, filepaths)
45
+
46
+
47
+ def load_documents(source_dir: str) -> list[Document]:
48
+ # Loads all documents from the source documents directory, including nested folders
49
+ paths = []
50
+ for root, _, files in os.walk(source_dir):
51
+ for file_name in files:
52
+ file_extension = os.path.splitext(file_name)[1]
53
+ source_file_path = os.path.join(root, file_name)
54
+ if file_extension in DOCUMENT_MAP.keys():
55
+ paths.append(source_file_path)
56
+
57
+ # Have at least one worker and at most INGEST_THREADS workers
58
+ n_workers = min(INGEST_THREADS, max(len(paths), 1))
59
+ chunksize = round(len(paths) / n_workers)
60
+ docs = []
61
+ with ProcessPoolExecutor(n_workers) as executor:
62
+ futures = []
63
+ # split the load operations into chunks
64
+ for i in range(0, len(paths), chunksize):
65
+ # select a chunk of filenames
66
+ filepaths = paths[i : (i + chunksize)]
67
+ # submit the task
68
+ future = executor.submit(load_document_batch, filepaths)
69
+ futures.append(future)
70
+ # process all results
71
+ for future in as_completed(futures):
72
+ # open the file and load the data
73
+ contents, _ = future.result()
74
+ docs.extend(contents)
75
+
76
+ return docs
77
+
78
+
79
+ def split_documents(documents: list[Document]) -> tuple[list[Document], list[Document]]:
80
+ # Splits documents for correct Text Splitter
81
+ text_docs, python_docs = [], []
82
+ for doc in documents:
83
+ file_extension = os.path.splitext(doc.metadata["source"])[1]
84
+ if file_extension == ".py":
85
+ python_docs.append(doc)
86
+ else:
87
+ text_docs.append(doc)
88
+
89
+ return text_docs, python_docs
90
+
91
+
92
+ @click.command()
93
+ @click.option(
94
+ "--device_type",
95
+ default="cuda" if torch.cuda.is_available() else "cpu",
96
+ type=click.Choice(
97
+ [
98
+ "cpu",
99
+ "cuda",
100
+ "ipu",
101
+ "xpu",
102
+ "mkldnn",
103
+ "opengl",
104
+ "opencl",
105
+ "ideep",
106
+ "hip",
107
+ "ve",
108
+ "fpga",
109
+ "ort",
110
+ "xla",
111
+ "lazy",
112
+ "vulkan",
113
+ "mps",
114
+ "meta",
115
+ "hpu",
116
+ "mtia",
117
+ ],
118
+ ),
119
+ help="Device to run on. (Default is cuda)",
120
+ )
121
+ def main(device_type):
122
+ # Load documents and split in chunks
123
+ logging.info(f"Loading documents from {SOURCE_DIRECTORY}")
124
+ documents = load_documents(SOURCE_DIRECTORY)
125
+ text_documents, python_documents = split_documents(documents)
126
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
127
+ python_splitter = RecursiveCharacterTextSplitter.from_language(
128
+ language=Language.PYTHON, chunk_size=880, chunk_overlap=200
129
+ )
130
+ texts = text_splitter.split_documents(text_documents)
131
+ texts.extend(python_splitter.split_documents(python_documents))
132
+ logging.info(f"Loaded {len(documents)} documents from {SOURCE_DIRECTORY}")
133
+ logging.info(f"Split into {len(texts)} chunks of text")
134
+
135
+ # Create embeddings
136
+ embeddings = HuggingFaceInstructEmbeddings(
137
+ model_name=EMBEDDING_MODEL_NAME,
138
+ model_kwargs={"device": device_type},
139
+ )
140
+ # change the embedding type here if you are running into issues.
141
+ # These are much smaller embeddings and will work for most appications
142
+ # If you use HuggingFaceEmbeddings, make sure to also use the same in the
143
+ # run_localGPT.py file.
144
+
145
+ # embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
146
+
147
+ db = Chroma.from_documents(
148
+ texts,
149
+ embeddings,
150
+ persist_directory=PERSIST_DIRECTORY,
151
+ client_settings=CHROMA_SETTINGS,
152
+
153
+ )
154
+
155
+
156
+
157
+ if __name__ == "__main__":
158
+ logging.basicConfig(
159
+ format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s", level=logging.INFO
160
+ )
161
+ main()
localGPT_UI.py ADDED
@@ -0,0 +1,119 @@
1
+ import torch
2
+ import subprocess
3
+ import streamlit as st
4
+ from run_localGPT import load_model
5
+ from langchain.vectorstores import Chroma
6
+ from constants import CHROMA_SETTINGS, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME
7
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
8
+ from langchain.chains import RetrievalQA
9
+ from streamlit_extras.add_vertical_space import add_vertical_space
10
+ from langchain.prompts import PromptTemplate
11
+ from langchain.memory import ConversationBufferMemory
12
+
13
+
14
+
15
+ def model_memory():
16
+ # Adding history to the model.
17
+ template = """Use the following pieces of context to answer the question at the end. If you don't know the answer,\
18
+ just say that you don't know, don't try to make up an answer.
19
+
20
+ {context}
21
+
22
+ {history}
23
+ Question: {question}
24
+ Helpful Answer:"""
25
+
26
+ prompt = PromptTemplate(input_variables=["history", "context", "question"], template=template)
27
+ memory = ConversationBufferMemory(input_key="question", memory_key="history")
28
+
29
+ return prompt, memory
30
+
31
+ # Sidebar contents
32
+ with st.sidebar:
33
+ st.title('🤗💬 Converse with your Data')
34
+ st.markdown('''
35
+ ## About
36
+ This app is an LLM-powered chatbot built using:
37
+ - [Streamlit](https://streamlit.io/)
38
+ - [LangChain](https://python.langchain.com/)
39
+ - [LocalGPT](https://github.com/PromtEngineer/localGPT)
40
+
41
+ ''')
42
+ add_vertical_space(5)
43
+ st.write('Made with ❤️ by [Prompt Engineer](https://youtube.com/@engineerprompt)')
44
+
45
+
46
+ DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"
47
+
48
+
49
+
50
+ if "result" not in st.session_state:
51
+ # Run the document ingestion process.
52
+ run_langest_commands = ["python", "ingest.py"]
53
+ run_langest_commands.append("--device_type")
54
+ run_langest_commands.append(DEVICE_TYPE)
55
+
56
+ result = subprocess.run(run_langest_commands, capture_output=True)
57
+ st.session_state.result = result
58
+
59
+ # Define the retriever
60
+ # load the vectorstore
61
+ if "EMBEDDINGS" not in st.session_state:
62
+ EMBEDDINGS = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": DEVICE_TYPE})
63
+ st.session_state.EMBEDDINGS = EMBEDDINGS
64
+
65
+ if "DB" not in st.session_state:
66
+ DB = Chroma(
67
+ persist_directory=PERSIST_DIRECTORY,
68
+ embedding_function=st.session_state.EMBEDDINGS,
69
+ client_settings=CHROMA_SETTINGS,
70
+ )
71
+ st.session_state.DB = DB
72
+
73
+ if "RETRIEVER" not in st.session_state:
74
+ RETRIEVER = DB.as_retriever()
75
+ st.session_state.RETRIEVER = RETRIEVER
76
+
77
+ if "LLM" not in st.session_state:
78
+ LLM = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME)
79
+ st.session_state["LLM"] = LLM
80
+
81
+
82
+
83
+
84
+ if "QA" not in st.session_state:
85
+
86
+ prompt, memory = model_memory()
87
+
88
+ QA = RetrievalQA.from_chain_type(
89
+ llm=LLM,
90
+ chain_type="stuff",
91
+ retriever=RETRIEVER,
92
+ return_source_documents=True,
93
+ chain_type_kwargs={"prompt": prompt, "memory": memory},
94
+ )
95
+ st.session_state["QA"] = QA
96
+
97
+ st.title('LocalGPT App 💬')
98
+ # Create a text input box for the user
99
+ prompt = st.text_input('Input your prompt here')
100
+ # while True:
101
+
102
+ # If the user hits enter
103
+ if prompt:
104
+ # Then pass the prompt to the LLM
105
+ response = st.session_state["QA"](prompt)
106
+ answer, docs = response["result"], response["source_documents"]
107
+ # ...and write it out to the screen
108
+ st.write(answer)
109
+
110
+ # With a streamlit expander
111
+ with st.expander('Document Similarity Search'):
112
+ # Find the relevant pages
113
+ search = st.session_state.DB.similarity_search_with_score(prompt)
114
+ # Write out the first
115
+ for i, doc in enumerate(search):
116
+ # print(doc)
117
+ st.write(f"Source Document # {i+1} : {doc[0].metadata['source'].split('/')[-1]}")
118
+ st.write(doc[0].page_content)
119
+ st.write("--------------------------------")
pyproject.toml ADDED
@@ -0,0 +1,15 @@
1
+ # ==== black ====
2
+ [tool.black]
3
+ line-length = 119
4
+ target-version = ['py311']
5
+
6
+
7
+ # ==== isort ====
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+ known_first_party = [
12
+ "tests",
13
+ "scripts",
14
+ "hooks",
15
+ ]
requirements.txt ADDED
@@ -0,0 +1,32 @@
1
+ # Natural Language Processing
2
+ langchain==0.0.267
3
+ chromadb==0.4.6
4
+ llama-cpp-python==0.1.78
5
+ pdfminer.six==20221105
6
+ InstructorEmbedding
7
+ sentence-transformers
8
+ faiss-cpu
9
+ huggingface_hub
10
+ transformers
11
+ protobuf==3.20.0; sys_platform != 'darwin'
12
+ protobuf==3.20.0; sys_platform == 'darwin' and platform_machine != 'arm64'
13
+ protobuf==3.20.3; sys_platform == 'darwin' and platform_machine == 'arm64'
14
+ auto-gptq==0.2.2
15
+ docx2txt
16
+ unstructured
17
+
18
+ # Utilities
19
+ urllib3==1.26.6
20
+ accelerate
21
+ bitsandbytes ; sys_platform != 'win32'
22
+ bitsandbytes-windows ; sys_platform == 'win32'
23
+ click
24
+ flask
25
+ requests
26
+
27
+ # Streamlit related
28
+ streamlit
29
+ Streamlit-extras
30
+
31
+ # Excel File Manipulation
32
+ openpyxl
run_localGPT.py ADDED
@@ -0,0 +1,247 @@
1
+ import logging
2
+
3
+ import click
4
+ import torch
5
+ from auto_gptq import AutoGPTQForCausalLM
6
+ from huggingface_hub import hf_hub_download
7
+ from langchain.chains import RetrievalQA
8
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
9
+ from langchain.llms import HuggingFacePipeline, LlamaCpp
10
+ from langchain.memory import ConversationBufferMemory
11
+ from langchain.prompts import PromptTemplate
12
+
13
+
14
+ # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
15
+ from langchain.vectorstores import Chroma
16
+ from transformers import (
17
+ AutoModelForCausalLM,
18
+ AutoTokenizer,
19
+ GenerationConfig,
20
+ LlamaForCausalLM,
21
+ LlamaTokenizer,
22
+ pipeline,
23
+ )
24
+
25
+ from constants import EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME
26
+
27
+
28
+ def load_model(device_type, model_id, model_basename=None):
29
+ """
30
+ Select a model for text generation using the HuggingFace library.
31
+ If you are running this for the first time, it will download a model for you.
32
+ Subsequent runs will use the model from disk.
33
+
34
+ Args:
35
+ device_type (str): Type of device to use, e.g., "cuda" for GPU or "cpu" for CPU.
36
+ model_id (str): Identifier of the model to load from HuggingFace's model hub.
37
+ model_basename (str, optional): Basename of the model if using quantized models.
38
+ Defaults to None.
39
+
40
+ Returns:
41
+ HuggingFacePipeline: A pipeline object for text generation using the loaded model.
42
+
43
+ Raises:
44
+ ValueError: If an unsupported model or device type is provided.
45
+ """
46
+ logging.info(f"Loading Model: {model_id}, on: {device_type}")
47
+ logging.info("This action can take a few minutes!")
48
+
49
+ if model_basename is not None:
50
+ if ".ggml" in model_basename:
51
+ logging.info("Using Llamacpp for GGML quantized models")
52
+ model_path = hf_hub_download(repo_id=model_id, filename=model_basename, resume_download=True)
53
+ max_ctx_size = 2048
54
+ kwargs = {
55
+ "model_path": model_path,
56
+ "n_ctx": max_ctx_size,
57
+ "max_tokens": max_ctx_size,
58
+ }
59
+ if device_type.lower() == "mps":
60
+ kwargs["n_gpu_layers"] = 1000
61
+ if device_type.lower() == "cuda":
62
+ kwargs["n_gpu_layers"] = 1000
63
+ kwargs["n_batch"] = max_ctx_size
64
+ return LlamaCpp(**kwargs)
65
+
66
+ else:
67
+ # The code supports all huggingface models that ends with GPTQ and have some variation
68
+ # of .no-act.order or .safetensors in their HF repo.
69
+ logging.info("Using AutoGPTQForCausalLM for quantized models")
70
+
71
+ if ".safetensors" in model_basename:
72
+ # Remove the ".safetensors" ending if present
73
+ model_basename = model_basename.replace(".safetensors", "")
74
+
75
+ tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
76
+ logging.info("Tokenizer loaded")
77
+
78
+ model = AutoGPTQForCausalLM.from_quantized(
79
+ model_id,
80
+ model_basename=model_basename,
81
+ use_safetensors=True,
82
+ trust_remote_code=True,
83
+ device="cuda:0",
84
+ use_triton=False,
85
+ quantize_config=None,
86
+ )
87
+ elif (
88
+ device_type.lower() == "cuda"
89
+ ): # The code supports all huggingface models that ends with -HF or which have a .bin
90
+ # file in their HF repo.
91
+ logging.info("Using AutoModelForCausalLM for full models")
92
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
93
+ logging.info("Tokenizer loaded")
94
+
95
+ model = AutoModelForCausalLM.from_pretrained(
96
+ model_id,
97
+ device_map="auto",
98
+ torch_dtype=torch.float16,
99
+ low_cpu_mem_usage=True,
100
+ trust_remote_code=True,
101
+ # max_memory={0: "15GB"} # Uncomment this line if you encounter CUDA out of memory errors
102
+ )
103
+ model.tie_weights()
104
+ else:
105
+ logging.info("Using LlamaTokenizer")
106
+ tokenizer = LlamaTokenizer.from_pretrained(model_id)
107
+ model = LlamaForCausalLM.from_pretrained(model_id)
108
+
109
+ # Load configuration from the model to avoid warnings
110
+ generation_config = GenerationConfig.from_pretrained(model_id)
111
+ # see here for details:
112
+ # https://huggingface.co/docs/transformers/
113
+ # main_classes/text_generation#transformers.GenerationConfig.from_pretrained.returns
114
+
115
+ # Create a pipeline for text generation
116
+ pipe = pipeline(
117
+ "text-generation",
118
+ model=model,
119
+ tokenizer=tokenizer,
120
+ max_length=2048,
121
+ temperature=0,
122
+ top_p=0.95,
123
+ repetition_penalty=1.15,
124
+ generation_config=generation_config,
125
+ )
126
+
127
+ local_llm = HuggingFacePipeline(pipeline=pipe)
128
+ logging.info("Local LLM Loaded")
129
+
130
+ return local_llm
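+ # Example calls (minimal sketches; the real MODEL_ID / MODEL_BASENAME are read from constants.py):
+ #   llm = load_model("cuda", model_id=MODEL_ID, model_basename=MODEL_BASENAME)  # quantized model
+ #   llm = load_model("cpu", model_id=MODEL_ID)  # full model; model_basename defaults to None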
131
+
132
+
133
+ # Choose the device type to run on, and whether to show source documents.
134
+ @click.command()
135
+ @click.option(
136
+ "--device_type",
137
+ default="cuda" if torch.cuda.is_available() else "cpu",
138
+ type=click.Choice(
139
+ [
140
+ "cpu",
141
+ "cuda",
142
+ "ipu",
143
+ "xpu",
144
+ "mkldnn",
145
+ "opengl",
146
+ "opencl",
147
+ "ideep",
148
+ "hip",
149
+ "ve",
150
+ "fpga",
151
+ "ort",
152
+ "xla",
153
+ "lazy",
154
+ "vulkan",
155
+ "mps",
156
+ "meta",
157
+ "hpu",
158
+ "mtia",
159
+ ],
160
+ ),
161
+ help="Device to run on. (Default is cuda if available, otherwise cpu)",
162
+ )
163
+ @click.option(
164
+ "--show_sources",
165
+ "-s",
166
+ is_flag=True,
167
+ help="Show sources along with answers (Default is False)",
168
+ )
169
+ def main(device_type, show_sources):
170
+ """
171
+ This function implements the information retrieval task.
172
+
173
+
174
+ 1. Loads an embedding model, which can be HuggingFaceInstructEmbeddings or HuggingFaceEmbeddings
175
+ 2. Loads the existing vectorstore that was created by ingest.py
176
+ 3. Loads the local LLM using the load_model function - you can now set different LLMs.
177
+ 4. Sets up the question-answer retrieval chain.
178
+ 5. Answers questions interactively until the user types "exit".
179
+ """
180
+
181
+ logging.info(f"Running on: {device_type}")
182
+ logging.info(f"Display Source Documents set to: {show_sources}")
183
+
184
+ embeddings = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": device_type})
185
+
186
+ # uncomment the following line if you used HuggingFaceEmbeddings in ingest.py
187
+ # embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
188
+
189
+ # load the vectorstore
190
+ db = Chroma(
191
+ persist_directory=PERSIST_DIRECTORY,
192
+ embedding_function=embeddings,
193
+
194
+ )
195
+ retriever = db.as_retriever()
196
+
197
+
198
+ template = """Use the following pieces of context to answer the question at the end. If you don't know the answer,\
199
+ just say that you don't know, don't try to make up an answer.
200
+
201
+ {context}
202
+
203
+ {history}
204
+ Question: {question}
205
+ Helpful Answer:"""
206
+
207
+ prompt = PromptTemplate(input_variables=["history", "context", "question"], template=template)
208
+ memory = ConversationBufferMemory(input_key="question", memory_key="history")
209
+
210
+ llm = load_model(device_type, model_id=MODEL_ID, model_basename=MODEL_BASENAME)
211
+
212
+ qa = RetrievalQA.from_chain_type(
213
+ llm=llm,
214
+ chain_type="stuff",
215
+ retriever=retriever,
216
+ return_source_documents=True,
217
+ chain_type_kwargs={"prompt": prompt, "memory": memory},
218
+ )
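+ # "stuff" concatenates the retrieved chunks into the {context} placeholder of the prompt,
+ # while ConversationBufferMemory fills {history} so follow-up questions can reference earlier turns.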
219
+ # Interactive questions and answers
220
+ while True:
221
+ query = input("\nEnter a query: ")
222
+ if query == "exit":
223
+ break
224
+ # Get the answer from the chain
225
+ res = qa(query)
226
+ answer, docs = res["result"], res["source_documents"]
227
+
228
+ # Print the result
229
+ print("\n\n> Question:")
230
+ print(query)
231
+ print("\n> Answer:")
232
+ print(answer)
233
+
234
+ if show_sources: # this is a flag that you can set to show the source documents used for the answer.
235
+ # Print the relevant sources used for the answer
236
+ print("----------------------------------SOURCE DOCUMENTS---------------------------")
237
+ for document in docs:
238
+ print("\n> " + document.metadata["source"] + ":")
239
+ print(document.page_content)
240
+ print("----------------------------------SOURCE DOCUMENTS---------------------------")
241
+
242
+
243
+ if __name__ == "__main__":
244
+ logging.basicConfig(
245
+ format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s", level=logging.INFO
246
+ )
247
+ main()
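+ # Example invocations (sketch; assumes the vectorstore was already built with `python ingest.py`):
+ #   python run_localGPT.py                              # uses cuda if available, otherwise cpu
+ #   python run_localGPT.py --device_type cpu --show_sources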
run_localGPT_API.py ADDED
@@ -0,0 +1,173 @@
1
+ import logging
2
+ import os
3
+ import shutil
4
+ import subprocess
5
+
6
+ import torch
7
+ from auto_gptq import AutoGPTQForCausalLM
8
+ from flask import Flask, jsonify, request
9
+ from langchain.chains import RetrievalQA
10
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
11
+
12
+ # from langchain.embeddings import HuggingFaceEmbeddings
13
+ from langchain.llms import HuggingFacePipeline
14
+ from run_localGPT import load_model
15
+
16
+ # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
17
+ from langchain.vectorstores import Chroma
18
+ from transformers import (
19
+ AutoModelForCausalLM,
20
+ AutoTokenizer,
21
+ GenerationConfig,
22
+ LlamaForCausalLM,
23
+ LlamaTokenizer,
24
+ pipeline,
25
+ )
26
+ from werkzeug.utils import secure_filename
27
+
28
+ from constants import CHROMA_SETTINGS, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME
29
+
30
+ DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"
31
+ SHOW_SOURCES = True
32
+ logging.info(f"Running on: {DEVICE_TYPE}")
33
+ logging.info(f"Display Source Documents set to: {SHOW_SOURCES}")
34
+
35
+ EMBEDDINGS = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": DEVICE_TYPE})
36
+
37
+ # uncomment the following line if you used HuggingFaceEmbeddings in ingest.py
38
+ # EMBEDDINGS = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
39
+ if os.path.exists(PERSIST_DIRECTORY):
40
+ try:
41
+ shutil.rmtree(PERSIST_DIRECTORY)
42
+ except OSError as e:
43
+ print(f"Error: {e.filename} - {e.strerror}.")
44
+ else:
45
+ print("The directory does not exist")
46
+
47
+ run_langest_commands = ["python", "ingest.py"]
48
+ if DEVICE_TYPE == "cpu":
49
+ run_langest_commands.append("--device_type")
50
+ run_langest_commands.append(DEVICE_TYPE)
51
+
52
+ result = subprocess.run(run_langest_commands, capture_output=True)
53
+ if result.returncode != 0:
54
+ raise FileNotFoundError(
55
+ "No files were found inside SOURCE_DOCUMENTS, please put a starter file inside before starting the API!"
56
+ )
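+ # Note: the startup block above deliberately wipes any existing index and re-runs ingest.py,
+ # so the API always serves whatever is currently inside SOURCE_DOCUMENTS.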
57
+
58
+ # load the vectorstore
59
+ DB = Chroma(
60
+ persist_directory=PERSIST_DIRECTORY,
61
+ embedding_function=EMBEDDINGS,
62
+ client_settings=CHROMA_SETTINGS,
63
+ )
64
+
65
+ RETRIEVER = DB.as_retriever()
66
+
67
+ LLM = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME)
68
+
69
+ QA = RetrievalQA.from_chain_type(
70
+ llm=LLM, chain_type="stuff", retriever=RETRIEVER, return_source_documents=SHOW_SOURCES
71
+ )
72
+
73
+ app = Flask(__name__)
74
+
75
+
76
+ @app.route("/api/delete_source", methods=["GET"])
77
+ def delete_source_route():
78
+ folder_name = "SOURCE_DOCUMENTS"
79
+
80
+ if os.path.exists(folder_name):
81
+ shutil.rmtree(folder_name)
82
+
83
+ os.makedirs(folder_name)
84
+
85
+ return jsonify({"message": f"Folder '{folder_name}' successfully deleted and recreated."})
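+ # Example call (sketch; the port matches app.run() at the bottom of this file):
+ #   curl http://localhost:5110/api/delete_source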
86
+
87
+
88
+ @app.route("/api/save_document", methods=["GET", "POST"])
89
+ def save_document_route():
90
+ if "document" not in request.files:
91
+ return "No document part", 400
92
+ file = request.files["document"]
93
+ if file.filename == "":
94
+ return "No selected file", 400
95
+ if file:
96
+ filename = secure_filename(file.filename)
97
+ folder_path = "SOURCE_DOCUMENTS"
98
+ if not os.path.exists(folder_path):
99
+ os.makedirs(folder_path)
100
+ file_path = os.path.join(folder_path, filename)
101
+ file.save(file_path)
102
+ return "File saved successfully", 200
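+ # Example call (sketch; uploads one file into SOURCE_DOCUMENTS as multipart form data,
+ # using the "document" field name expected above):
+ #   curl -X POST -F "document=@/path/to/some_file.pdf" http://localhost:5110/api/save_document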
103
+
104
+
105
+ @app.route("/api/run_ingest", methods=["GET"])
106
+ def run_ingest_route():
107
+ global DB
108
+ global RETRIEVER
109
+ global QA
110
+ try:
111
+ if os.path.exists(PERSIST_DIRECTORY):
112
+ try:
113
+ shutil.rmtree(PERSIST_DIRECTORY)
114
+ except OSError as e:
115
+ print(f"Error: {e.filename} - {e.strerror}.")
116
+ else:
117
+ print("The directory does not exist")
118
+
119
+ run_langest_commands = ["python", "ingest.py"]
120
+ if DEVICE_TYPE == "cpu":
121
+ run_langest_commands.append("--device_type")
122
+ run_langest_commands.append(DEVICE_TYPE)
123
+
124
+ result = subprocess.run(run_langest_commands, capture_output=True)
125
+ if result.returncode != 0:
126
+ return "Script execution failed: {}".format(result.stderr.decode("utf-8")), 500
127
+ # load the vectorstore
128
+ DB = Chroma(
129
+ persist_directory=PERSIST_DIRECTORY,
130
+ embedding_function=EMBEDDINGS,
131
+ client_settings=CHROMA_SETTINGS,
132
+ )
133
+ RETRIEVER = DB.as_retriever()
134
+
135
+ QA = RetrievalQA.from_chain_type(
136
+ llm=LLM, chain_type="stuff", retriever=RETRIEVER, return_source_documents=SHOW_SOURCES
137
+ )
138
+ return "Script executed successfully: {}".format(result.stdout.decode("utf-8")), 200
139
+ except Exception as e:
140
+ return f"Error occurred: {str(e)}", 500
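+ # Example call (sketch; re-runs ingest.py and rebuilds the Chroma index, so it can take a while):
+ #   curl http://localhost:5110/api/run_ingest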
141
+
142
+
143
+ @app.route("/api/prompt_route", methods=["GET", "POST"])
144
+ def prompt_route():
145
+ global QA
146
+ user_prompt = request.form.get("user_prompt")
147
+ if user_prompt:
148
+ # print(f'User Prompt: {user_prompt}')
149
+ # Get the answer from the chain
150
+ res = QA(user_prompt)
151
+ answer, docs = res["result"], res["source_documents"]
152
+
153
+ prompt_response_dict = {
154
+ "Prompt": user_prompt,
155
+ "Answer": answer,
156
+ }
157
+
158
+ prompt_response_dict["Sources"] = []
159
+ for document in docs:
160
+ prompt_response_dict["Sources"].append(
161
+ (os.path.basename(str(document.metadata["source"])), str(document.page_content))
162
+ )
163
+
164
+ return jsonify(prompt_response_dict), 200
165
+ else:
166
+ return "No user prompt received", 400
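+ # Example call (sketch; the form field name must be "user_prompt", matching request.form above):
+ #   curl -X POST -F "user_prompt=What is this document about?" http://localhost:5110/api/prompt_route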
167
+
168
+
169
+ if __name__ == "__main__":
170
+ logging.basicConfig(
171
+ format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s", level=logging.INFO
172
+ )
173
+ app.run(debug=False, port=5110)
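+ # To start the API (sketch; requires at least one ingestible file in SOURCE_DOCUMENTS, since the
+ # startup code above re-runs ingest.py and raises otherwise):
+ #   python run_localGPT_API.py
+ # The server then listens on http://localhost:5110.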