Upload 17 files
- .dockerignore +4 -0
- .editorconfig +17 -0
- .flake8 +4 -0
- .pre-commit-config.yaml +49 -0
- .pyup.yml +17 -0
- ACKNOWLEDGEMENT.md +10 -0
- CONTRIBUTING.md +47 -0
- Dockerfile +21 -0
- LICENSE +201 -0
- README.md +287 -13
- constants.py +142 -0
- ingest.py +161 -0
- localGPT_UI.py +119 -0
- pyproject.toml +15 -0
- requirements.txt +32 -0
- run_localGPT.py +247 -0
- run_localGPT_API.py +173 -0
.dockerignore
ADDED
@@ -0,0 +1,4 @@
*
!*.py
!requirements.txt
!SOURCE_DOCUMENTS
.editorconfig
ADDED
@@ -0,0 +1,17 @@
# http://editorconfig.org

root = true

[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true

[*.{py,rst,ini}]
indent_style = space
indent_size = 4

[*.{html,css,scss,json,yml,xml}]
indent_style = space
indent_size = 2
.flake8
ADDED
@@ -0,0 +1,4 @@
[flake8]
exclude = docs
max-line-length = 119
extend-ignore = E203
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,49 @@
default_stages: [commit]

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-json
      - id: check-toml
      - id: check-xml
      - id: check-yaml
      - id: debug-statements
      - id: check-builtin-literals
      - id: check-case-conflict
      - id: detect-private-key

  - repo: https://github.com/pre-commit/mirrors-prettier
    rev: "v3.0.0-alpha.9-for-vscode"
    hooks:
      - id: prettier
        args: ["--tab-width", "2"]

  - repo: https://github.com/asottile/pyupgrade
    rev: v3.4.0
    hooks:
      - id: pyupgrade
        args: [--py311-plus]
        exclude: hooks/

  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black

  - repo: https://github.com/PyCQA/isort
    rev: 5.12.0
    hooks:
      - id: isort

  - repo: https://github.com/PyCQA/flake8
    rev: 6.0.0
    hooks:
      - id: flake8

ci:
  autoupdate_schedule: weekly
  skip: []
  submodules: false
.pyup.yml
ADDED
@@ -0,0 +1,17 @@
# configure updates globally
# default: all
# allowed: all, insecure, False
update: all

# configure dependency pinning globally
# default: True
# allowed: True, False
pin: True

# add a label to pull requests, default is not set
# requires private repo permissions, even on public repos
# default: empty
label_prs: update

requirements:
  - "requirements.txt"
ACKNOWLEDGEMENT.md
ADDED
@@ -0,0 +1,10 @@
# Acknowledgments

Some code was taken from or inspired by other projects:

- [CookieCutter Django][cookiecutter-django]
  - `pre-commit-config.yaml` is taken from there with almost no changes
  - `github-actions.yml` is inspired by `gitlab-ci.yml`
  - `.pyup.yml`, `.flake8`, `.editorconfig`, and `pyproject.toml` are taken from there with minor changes

[cookiecutter-django]: https://github.com/cookiecutter/cookiecutter-django
CONTRIBUTING.md
ADDED
@@ -0,0 +1,47 @@
# How to Contribute

Always happy to get issues identified and pull requests!

## General considerations

1. Keep it small. The smaller the change, the more likely we are to accept it.
2. Changes that fix a current issue get priority for review.
3. Check out the [GitHub guide][submit-a-pr] if you've never created a pull request before.

## Getting started

1. Fork the repo
2. Clone your fork
3. Create a branch for your changes

This last step is very important: don't start developing from master, it'll cause pain if you need to send another change later.

TIP: If you're working on a GitHub issue, name your branch after the issue number, e.g. `issue-123-<ISSUE-NAME>`. This will help us keep track of what you're working on. If there is not an issue for what you're working on, please create one first. Someone else might be working on the same thing, or we might have a reason for not wanting to do it.

## Pre-commit

GitHub Actions will run the pre-commit hooks on your PR. If the hooks fail, you will need to fix them before your PR can be merged. It will save you a lot of time if you run the hooks locally before you push your changes. To do that, you need to install pre-commit on your local machine.

```shell
pip install pre-commit
```

Once installed, you need to add the pre-commit hooks to your local repo.

```shell
pre-commit install
```

Now, every time you commit, the hooks will run and check your code. If they fail, you will need to fix them before you can commit.

If you have already committed changes without the pre-commit hooks installed and do not want to reset and recommit, you can run the hooks on your local repo with the following command.

```shell
pre-commit run --all-files
```

## Help Us Improve This Documentation

If you find that something is missing or have suggestions for improvements, please submit a PR.

[submit-a-pr]: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request
Dockerfile
ADDED
@@ -0,0 +1,21 @@
# syntax=docker/dockerfile:1
# Build as `docker build . -t localgpt`, requires BuildKit.
# Run as `docker run -it --mount src="$HOME/.cache",target=/root/.cache,type=bind --gpus=all localgpt`, requires Nvidia container toolkit.

FROM nvidia/cuda:11.7.1-runtime-ubuntu22.04
RUN apt-get update && apt-get install -y software-properties-common
RUN apt-get install -y g++-11 make python3 python-is-python3 pip
# only copy what's needed at every step to optimize layer cache
COPY ./requirements.txt .
# use BuildKit cache mount to drastically reduce redownloading from pip on repeated builds
RUN --mount=type=cache,target=/root/.cache CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --timeout 100 -r requirements.txt
COPY SOURCE_DOCUMENTS ./SOURCE_DOCUMENTS
COPY ingest.py constants.py ./
# Docker BuildKit does not support GPU during *docker build* time right now, only during *docker run*.
# See <https://github.com/moby/buildkit/issues/1436>.
# If this changes in the future you can `docker build --build-arg device_type=cuda . -t localgpt` (+GPU argument to be determined).
ARG device_type=cpu
RUN --mount=type=cache,target=/root/.cache python ingest.py --device_type $device_type
COPY . .
ENV device_type=cuda
CMD python run_localGPT.py --device_type $device_type
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
CHANGED
@@ -1,13 +1,287 @@
# localGPT

This project was inspired by the original [privateGPT](https://github.com/imartinez/privateGPT). Most of the description here is inspired by the original privateGPT.

For a detailed overview of the project, watch these videos:

- [Detailed code-walkthrough](https://youtu.be/MlyoObdIHyo)
- [Llama-2 with LocalGPT](https://youtu.be/lbFmceo4D5E)
- [Adding Chat History](https://youtu.be/d7otIM_MCZs)

In this project, I have replaced the GPT4ALL model with the Vicuna-7B model, and we are using InstructorEmbeddings instead of the LlamaEmbeddings used in the original privateGPT. Both the embeddings and the LLM run on GPU instead of CPU. CPU is also supported if you do not have a GPU (see below for instructions).

Ask questions to your documents without an internet connection, using the power of LLMs. 100% private, no data leaves your execution environment at any point. You can ingest documents and ask questions without an internet connection!

Built with [LangChain](https://github.com/hwchase17/langchain), [Vicuna-7B](https://huggingface.co/TheBloke/vicuna-7B-1.1-HF) (+ a lot more!), and [InstructorEmbeddings](https://instructor-embedding.github.io/).

# Environment Setup

Install conda and create a new environment:

```shell
conda create -n localGPT
```

Activate it:

```shell
conda activate localGPT
```

In order to set your environment up to run the code here, first install all requirements:

```shell
pip install -r requirements.txt
```

If you want to use BLAS or Metal with [llama-cpp](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal), you can set the appropriate flags:

```shell
# Example: cuBLAS
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install -r requirements.txt
```

## Docker

Installing the required packages for GPU inference on Nvidia GPUs, like gcc 11 and CUDA 11, may cause conflicts with other packages on your system.
As an alternative to Conda, you can use Docker with the provided Dockerfile.
It includes CUDA; your system just needs Docker, BuildKit, your Nvidia GPU driver and the Nvidia container toolkit.
Build as `docker build . -t localgpt`, which requires BuildKit.
Docker BuildKit does not support GPU during *docker build* time right now, only during *docker run*.
Run as `docker run -it --mount src="$HOME/.cache",target=/root/.cache,type=bind --gpus=all localgpt`.
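
For convenience, the two commands above in one place (they are the same commands documented in the Dockerfile header; adjust the cache mount if your Hugging Face cache lives elsewhere):

```shell
# Build the image (BuildKit required)
docker build . -t localgpt

# Run with GPU access, reusing the local cache so models are not re-downloaded
docker run -it --mount src="$HOME/.cache",target=/root/.cache,type=bind --gpus=all localgpt
```
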
## Test dataset

This repo uses four PDFs of the Ontario Rule Book as an example dataset.

## Instructions for ingesting your own dataset

Put any and all of your .txt, .pdf, or .csv files into the SOURCE_DOCUMENTS directory. Alternatively, in the `load_documents()` function, replace the `docs_path` with the absolute path of your source_documents directory.

The current default file types are .txt, .pdf, .csv, and .xlsx; if you want to use any other file type, you will need to convert it to one of the default file types.

Run the following command to ingest all the data (defaults to `cuda`):

```shell
python ingest.py
```

Use the device type argument to specify a given device.

```sh
python ingest.py --device_type cpu
```

Use help for a full list of supported devices.

```sh
python ingest.py --help
```

It will create an index containing the local vectorstore. This will take time, depending on the size of your documents.
You can ingest as many documents as you want, and all will be accumulated in the local embeddings database.
If you want to start from an empty database, delete the `index`.

Note: The first run will take time because it has to download the embedding model. In subsequent runs, no data will leave your local environment, and ingestion can be run without an internet connection.

## Ask questions to your documents, locally!

In order to ask a question, run a command like:

```shell
python run_localGPT.py
```

And wait for the script to ask for your input.

```shell
> Enter a query:
```

Type your question and hit enter. Wait while the LLM consumes the prompt and prepares the answer. Once done, it will print the answer and the 4 sources it used as context from your documents; you can then ask another question without re-running the script, just wait for the prompt again.

Note: The first run needs an internet connection to download the Vicuna-7B model. After that you can turn off your internet connection, and the script inference will still work. No data gets out of your local environment.

Type `exit` to finish the script.

# Run it on CPU

By default, localGPT will use your GPU to run both the `ingest.py` and `run_localGPT.py` scripts. But if you do not have a GPU and want to run this on CPU, now you can do that (Warning: it's going to be slow!). You will need to use the `--device_type cpu` flag with both scripts.

For ingestion, run the following:

```shell
python ingest.py --device_type cpu
```

In order to ask a question, run a command like:

```shell
python run_localGPT.py --device_type cpu
```

# Run quantized for M1/M2

GGML quantized models for Apple Silicon (M1/M2) are supported through the llama-cpp library, [example](https://huggingface.co/TheBloke/Wizard-Vicuna-13B-Uncensored-GGML). GPTQ quantized models that leverage auto-gptq will not work, [see here](https://github.com/PanQiWei/AutoGPTQ/issues/133#issuecomment-1575002893). GGML models will work for CPU or MPS.
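
For example, assuming your PyTorch build has MPS support (see the Troubleshooting section below) and you are using one of the GGML models, ingestion and inference can be pointed at the Apple GPU with the same `--device_type` flag:

```shell
python ingest.py --device_type mps
python run_localGPT.py --device_type mps
```
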
## Troubleshooting

**Install MPS:**
1- Follow this [page](https://developer.apple.com/metal/pytorch/) to set up PyTorch with Metal Performance Shaders (MPS) support. PyTorch uses the new MPS backend for GPU training acceleration. It is good practice to verify MPS support using a simple Python script as mentioned in the provided link.

2- Following that page, here is an example of what you may run in your terminal:

```shell
xcode-select --install
conda install pytorch torchvision torchaudio -c pytorch-nightly
pip install chardet
pip install cchardet
pip uninstall charset_normalizer
pip install charset_normalizer
pip install pdfminer.six
pip install xformers
```

**Upgrade packages:**
Your langchain or llama-cpp version could be outdated. Upgrade your packages by running the install again.

```shell
pip install -r requirements.txt
```

If you are still getting errors, try installing the latest llama-cpp-python with these flags ([see thread](https://github.com/abetlen/llama-cpp-python/issues/317#issuecomment-1587962205)):

```shell
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
```

# Run the UI

1. Open `constants.py` in an editor of your choice and, depending on your choice, add the LLM you want to use. By default, the following model will be used:

   ```shell
   MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
   MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
   ```

2. Open up a terminal and activate your python environment that contains the dependencies installed from requirements.txt.

3. Navigate to the `/LOCALGPT` directory.

4. Run the following command: `python run_localGPT_API.py`. The API should begin to run.

5. Wait until everything has loaded in. You should see something like `INFO:werkzeug:Press CTRL+C to quit`.

6. Open up a second terminal and activate the same python environment.

7. Navigate to the `/LOCALGPT/localGPTUI` directory.

8. Run the command `python localGPTUI.py`.

9. Open up a web browser and go to the address `http://localhost:5111/` (see the consolidated sketch below).
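
Putting the steps above together, a minimal two-terminal sketch (assuming the conda environment from the setup section is named `localGPT` and the repo was cloned as `LOCALGPT`):

```shell
# Terminal 1: start the API
conda activate localGPT
cd LOCALGPT
python run_localGPT_API.py      # wait for "Press CTRL+C to quit"

# Terminal 2: start the UI
conda activate localGPT
cd LOCALGPT/localGPTUI
python localGPTUI.py            # then browse to http://localhost:5111/
```
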
# How does it work?

Selecting the right local models and leveraging the power of `LangChain`, you can run the entire pipeline locally, without any data leaving your environment, and with reasonable performance.

- `ingest.py` uses `LangChain` tools to parse the documents and create embeddings locally using `InstructorEmbeddings`. It then stores the result in a local vector database using the `Chroma` vector store.
- `run_localGPT.py` uses a local LLM to understand questions and create answers. The context for the answers is extracted from the local vector store using a similarity search to locate the right piece of context from the docs.
- You can replace this local LLM with any other LLM from HuggingFace. Make sure whatever LLM you select is in the HF format. A minimal sketch of the retrieval pipeline is shown below.
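
To make the flow concrete, here is a minimal sketch of the question-answering step, using the same building blocks as `run_localGPT.py` and `localGPT_UI.py` (the real scripts additionally add a prompt template, chat memory, and command-line options):

```python
# Minimal sketch: open the persisted Chroma index and answer one question.
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma

from constants import CHROMA_SETTINGS, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME
from run_localGPT import load_model

device_type = "cuda"  # or "cpu" / "mps"

# Embed queries with the same model that was used at ingestion time.
embeddings = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": device_type})

# Open the vector store that ingest.py wrote to PERSIST_DIRECTORY.
db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
retriever = db.as_retriever()

# Load the local LLM configured in constants.py.
llm = load_model(device_type=device_type, model_id=MODEL_ID, model_basename=MODEL_BASENAME)

# "stuff" the retrieved chunks into the prompt and generate an answer.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
response = qa("What does the rule book say about right of way?")  # example question
print(response["result"])
for doc in response["source_documents"]:
    print(doc.metadata["source"])
```
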
# How to select different LLM models?

The following instructions explain how to select a different LLM model to create your responses:

1. Open up `constants.py` in the editor of your choice.
2. Change the `MODEL_ID` and `MODEL_BASENAME`. If you are using a quantized model (`GGML`, `GPTQ`), you will need to provide `MODEL_BASENAME`. For unquantized models, set `MODEL_BASENAME` to `NONE`.
3. There are a number of example models from HuggingFace that have already been tested: original trained models (ending with HF, or with a .bin file in "Files and versions") and quantized models (ending with GPTQ, or with a .no-act-order or .safetensors file in "Files and versions").
4. For models that end with HF or have a .bin file inside "Files and versions" on their HuggingFace page:

   - Make sure you have a `MODEL_ID` selected. For example -> `MODEL_ID = "TheBloke/guanaco-7B-HF"`
   - If you go to its HuggingFace [repo](https://huggingface.co/TheBloke/guanaco-7B-HF) and go to "Files and versions", you will notice model files that end with a .bin extension.
   - Any model files that contain a .bin extension will be run with the code where the `# load the LLM for generating Natural Language responses` comment is found.
   - `MODEL_ID = "TheBloke/guanaco-7B-HF"`

5. For models that contain GPTQ in their name and/or have a .no-act-order or .safetensors extension inside "Files and versions" on their HuggingFace page:

   - Make sure you have a `MODEL_ID` selected. For example -> `MODEL_ID = "TheBloke/wizardLM-7B-GPTQ"`
   - You will also need its model basename file selected. For example -> `MODEL_BASENAME = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"`
   - If you go to its HuggingFace [repo](https://huggingface.co/TheBloke/wizardLM-7B-GPTQ) and go to "Files and versions", you will notice a model file that ends with a .safetensors extension.
   - Any model files that contain no-act-order or .safetensors extensions will be run with the code where the `# load the LLM for generating Natural Language responses` comment is found.
   - `MODEL_ID = "TheBloke/WizardLM-7B-uncensored-GPTQ"`

     `MODEL_BASENAME = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"`

6. Comment out all other instances of `MODEL_ID="other model names"`, `MODEL_BASENAME=other base model names`, and `llm = load_model(args*)`. Example settings are sketched below.
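
For reference, a sketch of what the two relevant lines in `constants.py` look like for each model family (values taken from the examples above and from the comments in `constants.py`; keep exactly one pair uncommented):

```python
# constants.py -- choose exactly one MODEL_ID / MODEL_BASENAME pair.

# GGML (quantized; works on CPU, CUDA and Apple MPS via llama-cpp) -- the default:
MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"

# Unquantized HF model -- no basename needed:
# MODEL_ID = "TheBloke/vicuna-7B-1.1-HF"
# MODEL_BASENAME = None

# GPTQ (quantized; GPU only, loaded through auto-gptq):
# MODEL_ID = "TheBloke/WizardLM-7B-uncensored-GPTQ"
# MODEL_BASENAME = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
```
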
# System Requirements

## Python Version

To use this software, you must have Python 3.10 or later installed. Earlier versions of Python will not compile.

## C++ Compiler

If you encounter an error while building a wheel during the `pip install` process, you may need to install a C++ compiler on your computer.

### For Windows 10/11

To install a C++ compiler on Windows 10/11, follow these steps:

1. Install Visual Studio 2022.
2. Make sure the following components are selected:
   - Universal Windows Platform development
   - C++ CMake tools for Windows
3. Download the MinGW installer from the [MinGW website](https://sourceforge.net/projects/mingw/).
4. Run the installer and select the "gcc" component.

### NVIDIA Driver Issues

Follow this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-22-04) to install NVIDIA drivers.

## Star History

[![Star History Chart](https://api.star-history.com/svg?repos=PromtEngineer/localGPT&type=Date)](https://star-history.com/#PromtEngineer/localGPT&Date)

# Disclaimer

This is a test project to validate the feasibility of a fully local solution for question answering using LLMs and vector embeddings. It is not production ready, and it is not meant to be used in production. Vicuna-7B is based on the Llama model, so it has the original Llama license.

# Common Errors

- [Torch not compatible with CUDA enabled](https://github.com/pytorch/pytorch/issues/30664)

  - Get the CUDA version:

    ```shell
    nvcc --version
    ```

    ```shell
    nvidia-smi
    ```

  - Try installing PyTorch depending on your CUDA version:

    ```shell
    conda install -c pytorch torchvision cudatoolkit=10.1 pytorch
    ```

  - If it doesn't work, try reinstalling:

    ```shell
    pip uninstall torch
    pip cache purge
    pip install torch -f https://download.pytorch.org/whl/torch_stable.html
    ```

- [ERROR: pip's dependency resolver does not currently take into account all the packages that are installed](https://stackoverflow.com/questions/72672196/error-pips-dependency-resolver-does-not-currently-take-into-account-all-the-pa/76604141#76604141)

  ```shell
  pip install h5py
  pip install typing-extensions
  pip install wheel
  ```

- [Failed to import transformers](https://github.com/huggingface/transformers/issues/11262)

  - Try re-installing:

    ```shell
    conda uninstall tokenizers, transformers
    pip install transformers
    ```
constants.py
ADDED
@@ -0,0 +1,142 @@
import os

# from dotenv import load_dotenv
from chromadb.config import Settings

# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader

# load_dotenv()
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))

# Define the folder for storing database
SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/SOURCE_DOCUMENTS"

PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/DB"

# Can be changed to a specific number
INGEST_THREADS = os.cpu_count() or 8

# Define the Chroma settings
CHROMA_SETTINGS = Settings(
    anonymized_telemetry=False,
    is_persistent=True,
)


# https://python.langchain.com/en/latest/_modules/langchain/document_loaders/excel.html#UnstructuredExcelLoader
DOCUMENT_MAP = {
    ".txt": TextLoader,
    ".md": TextLoader,
    ".py": TextLoader,
    ".pdf": PDFMinerLoader,
    ".csv": CSVLoader,
    ".xls": UnstructuredExcelLoader,
    ".xlsx": UnstructuredExcelLoader,
    ".docx": Docx2txtLoader,
    ".doc": Docx2txtLoader,
}

# Default Instructor Model
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large"  # Uses 1.5 GB of VRAM (High Accuracy with lower VRAM usage)

####
#### OTHER EMBEDDING MODEL OPTIONS
####

# EMBEDDING_MODEL_NAME = "hkunlp/instructor-xl"  # Uses 5 GB of VRAM (Most Accurate of all models)
# EMBEDDING_MODEL_NAME = "intfloat/e5-large-v2"  # Uses 1.5 GB of VRAM (A little less accurate than instructor-large)
# EMBEDDING_MODEL_NAME = "intfloat/e5-base-v2"  # Uses 0.5 GB of VRAM (A good model for lower VRAM GPUs)
# EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # Uses 0.2 GB of VRAM (Less accurate but fastest - only requires 150mb of vram)

####
#### MULTILINGUAL EMBEDDING MODELS
####

# EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large"  # Uses 2.5 GB of VRAM
# EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base"  # Uses 1.2 GB of VRAM


#### SELECT AN OPEN SOURCE LLM (LARGE LANGUAGE MODEL)
# Select the Model ID and model_basename
# load the LLM for generating Natural Language responses

#### GPU VRAM Memory required for LLM Models (ONLY) by Billion Parameter value (B Model)
#### Does not include VRAM used by Embedding Models - which use an additional 2GB-7GB of VRAM depending on the model.
####
#### (B Model)  (float32)   (float16)   (GPTQ 8bit)       (GPTQ 4bit)
####    7b        28 GB       14 GB      7 GB - 9 GB       3.5 GB - 5 GB
####    13b       52 GB       26 GB      13 GB - 15 GB     6.5 GB - 8 GB
####    32b       130 GB      65 GB      32.5 GB - 35 GB   16.25 GB - 19 GB
####    65b       260.8 GB    130.4 GB   65.2 GB - 67 GB   32.6 GB - 35 GB

MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"

####
#### (FOR HF MODELS)
####

# MODEL_ID = "TheBloke/vicuna-7B-1.1-HF"
# MODEL_BASENAME = None
# MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-HF"
# MODEL_ID = "TheBloke/guanaco-7B-HF"
# MODEL_ID = 'NousResearch/Nous-Hermes-13b'  # Requires ~ 23GB VRAM. Using STransformers
# alongside will 100% create OOM on 24GB cards.
# llm = load_model(device_type, model_id=model_id)

####
#### (FOR GPTQ QUANTIZED) Select a llm model based on your GPU and VRAM GB. Does not include Embedding Models VRAM usage.
####

##### 48GB VRAM Graphics Cards (RTX 6000, RTX A6000 and other 48GB VRAM GPUs) #####

### 65b GPTQ LLM Models for 48GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
# model_id = "TheBloke/guanaco-65B-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Airoboros-65B-GPT4-2.0-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/gpt4-alpaca-lora_mlp-65B-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Upstage-Llama1-65B-Instruct-GPTQ"
# model_basename = "model.safetensors"

##### 24GB VRAM Graphics Cards (RTX 3090 - RTX 4090 (35% Faster) - RTX A5000 - RTX A5500) #####

### 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
# model_id = "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# model_id = "TheBloke/vicuna-13B-v1.5-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Nous-Hermes-13B-GPTQ"
# model_basename = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order"
# model_id = "TheBloke/WizardLM-13B-V1.2-GPTQ"
# model_basename = "gptq_model-4bit-128g.safetensors

### 30b GPTQ Models for 24GB GPUs (*** Requires using intfloat/e5-base-v2 instead of hkunlp/instructor-large as embedding model ***)
# model_id = "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-30B-Uncensored-GPTQ-4bit--1g.act.order.safetensors"
# model_id = "TheBloke/WizardLM-30B-Uncensored-GPTQ"
# model_basename = "WizardLM-30B-Uncensored-GPTQ-4bit.act-order.safetensors"

##### 8-10GB VRAM Graphics Cards (RTX 3080 - RTX 3080 Ti - RTX 3070 Ti - 3060 Ti - RTX 2000 Series, Quadro RTX 4000, 5000, 6000) #####
### (*** Requires using intfloat/e5-small-v2 instead of hkunlp/instructor-large as embedding model ***)

### 7b GPTQ Models for 8GB GPUs
# model_id = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-7B-Uncensored-GPTQ-4bit-128g.no-act.order.safetensors"
# model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
# model_basename = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# model_id = "TheBloke/wizardLM-7B-GPTQ"
# model_basename = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"

####
#### (FOR GGML) (Quantized cpu+gpu+mps) models - check if they support llama.cpp
####

# MODEL_ID = "TheBloke/wizard-vicuna-13B-GGML"
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q4_0.bin"
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q6_K.bin"
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q2_K.bin"
# MODEL_ID = "TheBloke/orca_mini_3B-GGML"
# MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"
ingest.py
ADDED
@@ -0,0 +1,161 @@
import logging
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed

import click
import torch
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from constants import (
    CHROMA_SETTINGS,
    DOCUMENT_MAP,
    EMBEDDING_MODEL_NAME,
    INGEST_THREADS,
    PERSIST_DIRECTORY,
    SOURCE_DIRECTORY,
)


def load_single_document(file_path: str) -> Document:
    # Loads a single document from a file path
    file_extension = os.path.splitext(file_path)[1]
    loader_class = DOCUMENT_MAP.get(file_extension)
    if loader_class:
        loader = loader_class(file_path)
    else:
        raise ValueError("Document type is undefined")
    return loader.load()[0]


def load_document_batch(filepaths):
    logging.info("Loading document batch")
    # create a thread pool
    with ThreadPoolExecutor(len(filepaths)) as exe:
        # load files
        futures = [exe.submit(load_single_document, name) for name in filepaths]
        # collect data
        data_list = [future.result() for future in futures]
        # return data and file paths
        return (data_list, filepaths)


def load_documents(source_dir: str) -> list[Document]:
    # Loads all documents from the source documents directory, including nested folders
    paths = []
    for root, _, files in os.walk(source_dir):
        for file_name in files:
            file_extension = os.path.splitext(file_name)[1]
            source_file_path = os.path.join(root, file_name)
            if file_extension in DOCUMENT_MAP.keys():
                paths.append(source_file_path)

    # Have at least one worker and at most INGEST_THREADS workers
    n_workers = min(INGEST_THREADS, max(len(paths), 1))
    chunksize = round(len(paths) / n_workers)
    docs = []
    with ProcessPoolExecutor(n_workers) as executor:
        futures = []
        # split the load operations into chunks
        for i in range(0, len(paths), chunksize):
            # select a chunk of filenames
            filepaths = paths[i : (i + chunksize)]
            # submit the task
            future = executor.submit(load_document_batch, filepaths)
            futures.append(future)
        # process all results
        for future in as_completed(futures):
            # open the file and load the data
            contents, _ = future.result()
            docs.extend(contents)

    return docs


def split_documents(documents: list[Document]) -> tuple[list[Document], list[Document]]:
    # Splits documents for correct Text Splitter
    text_docs, python_docs = [], []
    for doc in documents:
        file_extension = os.path.splitext(doc.metadata["source"])[1]
        if file_extension == ".py":
            python_docs.append(doc)
        else:
            text_docs.append(doc)

    return text_docs, python_docs


@click.command()
@click.option(
    "--device_type",
    default="cuda" if torch.cuda.is_available() else "cpu",
    type=click.Choice(
        [
            "cpu",
            "cuda",
            "ipu",
            "xpu",
            "mkldnn",
            "opengl",
            "opencl",
            "ideep",
            "hip",
            "ve",
            "fpga",
            "ort",
            "xla",
            "lazy",
            "vulkan",
            "mps",
            "meta",
            "hpu",
            "mtia",
        ],
    ),
    help="Device to run on. (Default is cuda)",
)
def main(device_type):
    # Load documents and split in chunks
    logging.info(f"Loading documents from {SOURCE_DIRECTORY}")
    documents = load_documents(SOURCE_DIRECTORY)
    text_documents, python_documents = split_documents(documents)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    python_splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.PYTHON, chunk_size=880, chunk_overlap=200
    )
    texts = text_splitter.split_documents(text_documents)
    texts.extend(python_splitter.split_documents(python_documents))
    logging.info(f"Loaded {len(documents)} documents from {SOURCE_DIRECTORY}")
    logging.info(f"Split into {len(texts)} chunks of text")

    # Create embeddings
    embeddings = HuggingFaceInstructEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={"device": device_type},
    )
    # change the embedding type here if you are running into issues.
    # These are much smaller embeddings and will work for most applications
    # If you use HuggingFaceEmbeddings, make sure to also use the same in the
    # run_localGPT.py file.

    # embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    db = Chroma.from_documents(
        texts,
        embeddings,
        persist_directory=PERSIST_DIRECTORY,
        client_settings=CHROMA_SETTINGS,
    )


if __name__ == "__main__":
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s", level=logging.INFO
    )
    main()
localGPT_UI.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import subprocess
|
3 |
+
import streamlit as st
|
4 |
+
from run_localGPT import load_model
|
5 |
+
from langchain.vectorstores import Chroma
|
6 |
+
from constants import CHROMA_SETTINGS, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME
|
7 |
+
from langchain.embeddings import HuggingFaceInstructEmbeddings
|
8 |
+
from langchain.chains import RetrievalQA
|
9 |
+
from streamlit_extras.add_vertical_space import add_vertical_space
|
10 |
+
from langchain.prompts import PromptTemplate
|
11 |
+
from langchain.memory import ConversationBufferMemory
|
12 |
+
|
13 |
+
|
14 |
+
|
15 |
+
def model_memory():
|
16 |
+
# Adding history to the model.
|
17 |
+
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer,\
|
18 |
+
just say that you don't know, don't try to make up an answer.
|
19 |
+
|
{context}

{history}
Question: {question}
Helpful Answer:"""

    prompt = PromptTemplate(input_variables=["history", "context", "question"], template=template)
    memory = ConversationBufferMemory(input_key="question", memory_key="history")

    return prompt, memory


# Sidebar contents
with st.sidebar:
    st.title('🤗💬 Converse with your Data')
    st.markdown('''
    ## About
    This app is an LLM-powered chatbot built using:
    - [Streamlit](https://streamlit.io/)
    - [LangChain](https://python.langchain.com/)
    - [LocalGPT](https://github.com/PromtEngineer/localGPT)

    ''')
    add_vertical_space(5)
    st.write('Made with ❤️ by [Prompt Engineer](https://youtube.com/@engineerprompt)')


DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"


if "result" not in st.session_state:
    # Run the document ingestion process.
    run_langest_commands = ["python", "ingest.py"]
    run_langest_commands.append("--device_type")
    run_langest_commands.append(DEVICE_TYPE)

    result = subprocess.run(run_langest_commands, capture_output=True)
    st.session_state.result = result

# Define the retriever
# load the vectorstore
if "EMBEDDINGS" not in st.session_state:
    EMBEDDINGS = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": DEVICE_TYPE})
    st.session_state.EMBEDDINGS = EMBEDDINGS

if "DB" not in st.session_state:
    DB = Chroma(
        persist_directory=PERSIST_DIRECTORY,
        embedding_function=st.session_state.EMBEDDINGS,
        client_settings=CHROMA_SETTINGS,
    )
    st.session_state.DB = DB

if "RETRIEVER" not in st.session_state:
    RETRIEVER = DB.as_retriever()
    st.session_state.RETRIEVER = RETRIEVER

if "LLM" not in st.session_state:
    LLM = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME)
    st.session_state["LLM"] = LLM


if "QA" not in st.session_state:
    prompt, memory = model_memory()

    QA = RetrievalQA.from_chain_type(
        llm=LLM,
        chain_type="stuff",
        retriever=RETRIEVER,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt, "memory": memory},
    )
    st.session_state["QA"] = QA

st.title('LocalGPT App 💬')
# Create a text input box for the user
prompt = st.text_input('Input your prompt here')
# while True:

# If the user hits enter
if prompt:
    # Then pass the prompt to the LLM
    response = st.session_state["QA"](prompt)
    answer, docs = response["result"], response["source_documents"]
    # ...and write it out to the screen
    st.write(answer)

    # With a streamlit expander
    with st.expander('Document Similarity Search'):
        # Find the relevant pages
        search = st.session_state.DB.similarity_search_with_score(prompt)
        # Write out the most relevant source documents
        for i, doc in enumerate(search):
            # print(doc)
            st.write(f"Source Document # {i+1} : {doc[0].metadata['source'].split('/')[-1]}")
            st.write(doc[0].page_content)
            st.write("--------------------------------")
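For reference, here is a minimal sketch of how the history/context/question placeholders in the template above get filled at query time, using only the LangChain classes this UI already imports. The template string and the saved exchange below are invented for illustration; they are not taken from the repository.

from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

# Same wiring as model_memory() above: past turns are stored under "history",
# retrieved chunks are injected as "context", and the new query is "question".
template = "Context: {context}\nHistory: {history}\nQuestion: {question}\nAnswer:"
prompt = PromptTemplate(input_variables=["history", "context", "question"], template=template)
memory = ConversationBufferMemory(input_key="question", memory_key="history")

# Record one (invented) earlier exchange, then format a follow-up prompt.
memory.save_context({"question": "What is localGPT?"}, {"output": "A local document Q&A tool."})
history = memory.load_memory_variables({})["history"]
print(prompt.format(context="(retrieved chunks would go here)", history=history, question="Does it need internet?"))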
pyproject.toml
ADDED
@@ -0,0 +1,15 @@
# ==== black ====
[tool.black]
line-length = 119
target-version = ['py311']


# ==== isort ====
[tool.isort]
profile = "black"
line_length = 119
known_first_party = [
    "tests",
    "scripts",
    "hooks",
]
requirements.txt
ADDED
@@ -0,0 +1,32 @@
# Natural Language Processing
langchain==0.0.267
chromadb==0.4.6
llama-cpp-python==0.1.78
pdfminer.six==20221105
InstructorEmbedding
sentence-transformers
faiss-cpu
huggingface_hub
transformers
protobuf==3.20.0; sys_platform != 'darwin'
protobuf==3.20.0; sys_platform == 'darwin' and platform_machine != 'arm64'
protobuf==3.20.3; sys_platform == 'darwin' and platform_machine == 'arm64'
auto-gptq==0.2.2
docx2txt
unstructured

# Utilities
urllib3==1.26.6
accelerate
bitsandbytes ; sys_platform != 'win32'
bitsandbytes-windows ; sys_platform == 'win32'
click
flask
requests

# Streamlit related
streamlit
Streamlit-extras

# Excel File Manipulation
openpyxl
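The protobuf and bitsandbytes pins above use PEP 508 environment markers, which pip evaluates against the current platform at install time. A small sketch of that evaluation, assuming the packaging library is available in the environment (it is not listed in this file, but typically comes with a recent pip/setuptools toolchain):

from packaging.markers import Marker

# Markers copied from the pins above; pip evaluates each one on the installing machine.
apple_silicon = Marker("sys_platform == 'darwin' and platform_machine == 'arm64'")
windows = Marker("sys_platform == 'win32'")

print(apple_silicon.evaluate())  # True only on Apple Silicon macOS, where protobuf==3.20.3 is selected
print(windows.evaluate())        # True only on Windows, where bitsandbytes-windows is selected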
run_localGPT.py
ADDED
@@ -0,0 +1,247 @@
import logging

import click
import torch
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import hf_hub_download
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline, LlamaCpp
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate


# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    pipeline,
)

from constants import EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME


def load_model(device_type, model_id, model_basename=None):
    """
    Select a model for text generation using the HuggingFace library.
    If you are running this for the first time, it will download a model for you.
    Subsequent runs will use the model from the disk.

    Args:
        device_type (str): Type of device to use, e.g., "cuda" for GPU or "cpu" for CPU.
        model_id (str): Identifier of the model to load from HuggingFace's model hub.
        model_basename (str, optional): Basename of the model if using quantized models.
            Defaults to None.

    Returns:
        HuggingFacePipeline: A pipeline object for text generation using the loaded model.

    Raises:
        ValueError: If an unsupported model or device type is provided.
    """
    logging.info(f"Loading Model: {model_id}, on: {device_type}")
    logging.info("This action can take a few minutes!")

    if model_basename is not None:
        if ".ggml" in model_basename:
            logging.info("Using Llamacpp for GGML quantized models")
            model_path = hf_hub_download(repo_id=model_id, filename=model_basename, resume_download=True)
            max_ctx_size = 2048
            kwargs = {
                "model_path": model_path,
                "n_ctx": max_ctx_size,
                "max_tokens": max_ctx_size,
            }
            if device_type.lower() == "mps":
                kwargs["n_gpu_layers"] = 1000
            if device_type.lower() == "cuda":
                kwargs["n_gpu_layers"] = 1000
                kwargs["n_batch"] = max_ctx_size
            return LlamaCpp(**kwargs)

        else:
            # The code supports all huggingface models that end with GPTQ and have some variation
            # of .no-act.order or .safetensors in their HF repo.
            logging.info("Using AutoGPTQForCausalLM for quantized models")

            if ".safetensors" in model_basename:
                # Remove the ".safetensors" ending if present
                model_basename = model_basename.replace(".safetensors", "")

            tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
            logging.info("Tokenizer loaded")

            model = AutoGPTQForCausalLM.from_quantized(
                model_id,
                model_basename=model_basename,
                use_safetensors=True,
                trust_remote_code=True,
                device="cuda:0",
                use_triton=False,
                quantize_config=None,
            )
    elif (
        device_type.lower() == "cuda"
    ):  # The code supports all huggingface models that end with -HF or which have a .bin
        # file in their HF repo.
        logging.info("Using AutoModelForCausalLM for full models")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        logging.info("Tokenizer loaded")

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            # max_memory={0: "15GB"}  # Uncomment this line if you encounter CUDA out of memory errors
        )
        model.tie_weights()
    else:
        logging.info("Using LlamaTokenizer")
        tokenizer = LlamaTokenizer.from_pretrained(model_id)
        model = LlamaForCausalLM.from_pretrained(model_id)

    # Load configuration from the model to avoid warnings
    generation_config = GenerationConfig.from_pretrained(model_id)
    # see here for details:
    # https://huggingface.co/docs/transformers/
    # main_classes/text_generation#transformers.GenerationConfig.from_pretrained.returns

    # Create a pipeline for text generation
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=2048,
        temperature=0,
        top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )

    local_llm = HuggingFacePipeline(pipeline=pipe)
    logging.info("Local LLM Loaded")

    return local_llm


# Choose the device type to run on, as well as whether to show source documents.
@click.command()
@click.option(
    "--device_type",
    default="cuda" if torch.cuda.is_available() else "cpu",
    type=click.Choice(
        [
            "cpu",
            "cuda",
            "ipu",
            "xpu",
            "mkldnn",
            "opengl",
            "opencl",
            "ideep",
            "hip",
            "ve",
            "fpga",
            "ort",
            "xla",
            "lazy",
            "vulkan",
            "mps",
            "meta",
            "hpu",
            "mtia",
        ],
    ),
    help="Device to run on. (Default is cuda)",
)
@click.option(
    "--show_sources",
    "-s",
    is_flag=True,
    help="Show sources along with answers (Default is False)",
)
def main(device_type, show_sources):
    """
    This function implements the information retrieval task.


    1. Loads an embedding model, which can be HuggingFaceInstructEmbeddings or HuggingFaceEmbeddings.
    2. Loads the existing vectorstore that was created by ingest.py.
    3. Loads the local LLM using the load_model function - you can now set different LLMs.
    4. Sets up the question-answer retrieval chain.
    5. Answers questions in an interactive loop.
    """

    logging.info(f"Running on: {device_type}")
    logging.info(f"Display Source Documents set to: {show_sources}")

    embeddings = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": device_type})

    # uncomment the following line if you used HuggingFaceEmbeddings in ingest.py
    # embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    # load the vectorstore
    db = Chroma(
        persist_directory=PERSIST_DIRECTORY,
        embedding_function=embeddings,
    )
    retriever = db.as_retriever()

    template = """Use the following pieces of context to answer the question at the end. If you don't know the answer,\
just say that you don't know, don't try to make up an answer.

{context}

{history}
Question: {question}
Helpful Answer:"""

    prompt = PromptTemplate(input_variables=["history", "context", "question"], template=template)
    memory = ConversationBufferMemory(input_key="question", memory_key="history")

    llm = load_model(device_type, model_id=MODEL_ID, model_basename=MODEL_BASENAME)

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt, "memory": memory},
    )
    # Interactive questions and answers
    while True:
        query = input("\nEnter a query: ")
        if query == "exit":
            break
        # Get the answer from the chain
        res = qa(query)
        answer, docs = res["result"], res["source_documents"]

        # Print the result
        print("\n\n> Question:")
        print(query)
        print("\n> Answer:")
        print(answer)

        if show_sources:  # this flag controls whether the source documents are printed with the answer
            # Print the relevant sources used for the answer
            print("----------------------------------SOURCE DOCUMENTS---------------------------")
            for document in docs:
                print("\n> " + document.metadata["source"] + ":")
                print(document.page_content)
            print("----------------------------------SOURCE DOCUMENTS---------------------------")


if __name__ == "__main__":
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s", level=logging.INFO
    )
    main()
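Beyond the interactive loop above, load_model can also be reused on its own. A minimal sketch, reusing the same constants.py values this script imports (the question string is only an example, and the first call will download the configured model):

import torch
from constants import MODEL_ID, MODEL_BASENAME
from run_localGPT import load_model

device = "cuda" if torch.cuda.is_available() else "cpu"
llm = load_model(device_type=device, model_id=MODEL_ID, model_basename=MODEL_BASENAME)

# The returned object is a callable LangChain LLM (HuggingFacePipeline or LlamaCpp),
# so a plain, retrieval-free completion looks like this:
print(llm("Summarize what retrieval-augmented generation does in one sentence."))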
run_localGPT_API.py
ADDED
@@ -0,0 +1,173 @@
import logging
import os
import shutil
import subprocess

import torch
from auto_gptq import AutoGPTQForCausalLM
from flask import Flask, jsonify, request
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings

# from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from run_localGPT import load_model

# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    pipeline,
)
from werkzeug.utils import secure_filename

from constants import CHROMA_SETTINGS, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY, MODEL_ID, MODEL_BASENAME

DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"
SHOW_SOURCES = True
logging.info(f"Running on: {DEVICE_TYPE}")
logging.info(f"Display Source Documents set to: {SHOW_SOURCES}")

EMBEDDINGS = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": DEVICE_TYPE})

# uncomment the following line if you used HuggingFaceEmbeddings in ingest.py
# EMBEDDINGS = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
if os.path.exists(PERSIST_DIRECTORY):
    try:
        shutil.rmtree(PERSIST_DIRECTORY)
    except OSError as e:
        print(f"Error: {e.filename} - {e.strerror}.")
else:
    print("The directory does not exist")

run_langest_commands = ["python", "ingest.py"]
if DEVICE_TYPE == "cpu":
    run_langest_commands.append("--device_type")
    run_langest_commands.append(DEVICE_TYPE)

result = subprocess.run(run_langest_commands, capture_output=True)
if result.returncode != 0:
    raise FileNotFoundError(
        "No files were found inside SOURCE_DOCUMENTS, please put a starter file inside before starting the API!"
    )

# load the vectorstore
DB = Chroma(
    persist_directory=PERSIST_DIRECTORY,
    embedding_function=EMBEDDINGS,
    client_settings=CHROMA_SETTINGS,
)

RETRIEVER = DB.as_retriever()

LLM = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME)

QA = RetrievalQA.from_chain_type(
    llm=LLM, chain_type="stuff", retriever=RETRIEVER, return_source_documents=SHOW_SOURCES
)

app = Flask(__name__)


@app.route("/api/delete_source", methods=["GET"])
def delete_source_route():
    folder_name = "SOURCE_DOCUMENTS"

    if os.path.exists(folder_name):
        shutil.rmtree(folder_name)

    os.makedirs(folder_name)

    return jsonify({"message": f"Folder '{folder_name}' successfully deleted and recreated."})


@app.route("/api/save_document", methods=["GET", "POST"])
def save_document_route():
    if "document" not in request.files:
        return "No document part", 400
    file = request.files["document"]
    if file.filename == "":
        return "No selected file", 400
    if file:
        filename = secure_filename(file.filename)
        folder_path = "SOURCE_DOCUMENTS"
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        file_path = os.path.join(folder_path, filename)
        file.save(file_path)
        return "File saved successfully", 200


@app.route("/api/run_ingest", methods=["GET"])
def run_ingest_route():
    global DB
    global RETRIEVER
    global QA
    try:
        if os.path.exists(PERSIST_DIRECTORY):
            try:
                shutil.rmtree(PERSIST_DIRECTORY)
            except OSError as e:
                print(f"Error: {e.filename} - {e.strerror}.")
        else:
            print("The directory does not exist")

        run_langest_commands = ["python", "ingest.py"]
        if DEVICE_TYPE == "cpu":
            run_langest_commands.append("--device_type")
            run_langest_commands.append(DEVICE_TYPE)

        result = subprocess.run(run_langest_commands, capture_output=True)
        if result.returncode != 0:
            return "Script execution failed: {}".format(result.stderr.decode("utf-8")), 500
        # load the vectorstore
        DB = Chroma(
            persist_directory=PERSIST_DIRECTORY,
            embedding_function=EMBEDDINGS,
            client_settings=CHROMA_SETTINGS,
        )
        RETRIEVER = DB.as_retriever()

        QA = RetrievalQA.from_chain_type(
            llm=LLM, chain_type="stuff", retriever=RETRIEVER, return_source_documents=SHOW_SOURCES
        )
        return "Script executed successfully: {}".format(result.stdout.decode("utf-8")), 200
    except Exception as e:
        return f"Error occurred: {str(e)}", 500


@app.route("/api/prompt_route", methods=["GET", "POST"])
def prompt_route():
    global QA
    user_prompt = request.form.get("user_prompt")
    if user_prompt:
        # print(f'User Prompt: {user_prompt}')
        # Get the answer from the chain
        res = QA(user_prompt)
        answer, docs = res["result"], res["source_documents"]

        prompt_response_dict = {
            "Prompt": user_prompt,
            "Answer": answer,
        }

        prompt_response_dict["Sources"] = []
        for document in docs:
            prompt_response_dict["Sources"].append(
                (os.path.basename(str(document.metadata["source"])), str(document.page_content))
            )

        return jsonify(prompt_response_dict), 200
    else:
        return "No user prompt received", 400


if __name__ == "__main__":
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s", level=logging.INFO
    )
    app.run(debug=False, port=5110)
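A minimal client sketch for the endpoints defined above, using the requests package already pinned in requirements.txt. The file name and question are placeholders, and the server is assumed to be running locally on its default port 5110:

import requests

BASE = "http://localhost:5110"

# Upload a document into SOURCE_DOCUMENTS, then rebuild the vector store.
with open("example.pdf", "rb") as f:
    print(requests.post(f"{BASE}/api/save_document", files={"document": f}).text)
print(requests.get(f"{BASE}/api/run_ingest").text)

# Ask a question against the ingested documents and list the sources used.
resp = requests.post(f"{BASE}/api/prompt_route", data={"user_prompt": "What is this document about?"})
print(resp.json()["Answer"])
for source, content in resp.json()["Sources"]:
    print(source)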