Boardpac/theekshanas committed on
Commit 39de480
1 Parent(s): 467720e

upload files again

.env ADDED
@@ -0,0 +1,24 @@
+ #embeddings
+ EMBEDDINGS_MODEL_NAME=all-MiniLM-L6-v2
+ EMBEDDING_CHUNK_SIZE=1000
+ EMBEDDING_CHUNK_OVERLAP=150
+
+ #gpt4all
+ GPT4ALL_MODEL_PATH=models/ggml-gpt4all-j-v1.3-groovy.bin
+ MODEL_N_CTX=1000
+ MODEL_N_BATCH=8
+ TARGET_SOURCE_CHUNKS=4
+
+ #API token keys (use your own keys; never commit real secrets)
+ HUGGINGFACEHUB_API_TOKEN=<your-huggingfacehub-api-token>
+ OPENAI_API_KEY=<your-openai-api-key>
+
+ #api app
+ APP_HOST=127.0.0.1
+ APP_PORT=8000
+
+ #model verbose
+ VERBOSE=True
+
+ ENABLE_HUGGINGFACE_HUB_MODELS=True
+ ENABLE_OPENAI_API_MODELS=True
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
+ models/
+ *.ipynb
+
+ CBSL
+ faiss_index/
LICENSE ADDED
@@ -0,0 +1,201 @@
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
README.md ADDED
@@ -0,0 +1,157 @@
+ ---
+ title: Boardpac Chat App Test
+ emoji: 😻
+ colorFrom: gray
+ colorTo: purple
+ sdk: streamlit
+ sdk_version: 1.26.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # privateGPT
+ Ask questions to your documents without an internet connection, using the power of LLMs. 100% private: no data leaves your execution environment at any point, and you can ingest documents and ask questions completely offline!
+
+ Built with [LangChain](https://github.com/hwchase17/langchain), [GPT4All](https://github.com/nomic-ai/gpt4all), [LlamaCpp](https://github.com/ggerganov/llama.cpp), [Chroma](https://www.trychroma.com/) and [SentenceTransformers](https://www.sbert.net/).
+
+ <img width="902" alt="demo" src="https://user-images.githubusercontent.com/721666/236942256-985801c9-25b9-48ef-80be-3acbb4575164.png">
+
+ ### How to run
+ ```shell
+ python -m streamlit run app.py
+ ```
+
+ # Environment Setup
+ In order to set your environment up to run the code here, first install all requirements:
+
+ ```shell
+ pip3 install -r requirements.txt
+ ```
+
+ Then, download the LLM model and place it in a directory of your choice:
+ - LLM: defaults to [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin). If you prefer a different GPT4All-J compatible model, just download it and reference it in your `.env` file.
+
+ Copy the `example.env` template into `.env`
+ ```shell
+ cp example.env .env
+ ```
+
+ and edit the variables appropriately in the `.env` file.
+ ```
+ MODEL_TYPE: supports LlamaCpp or GPT4All
+ PERSIST_DIRECTORY: is the folder you want your vectorstore in
+ MODEL_PATH: Path to your GPT4All or LlamaCpp supported LLM
+ MODEL_N_CTX: Maximum token limit for the LLM model
+ MODEL_N_BATCH: Number of tokens in the prompt that are fed into the model at a time. Optimal value differs a lot depending on the model (8 works well for GPT4All, and 1024 is better for LlamaCpp)
+ EMBEDDINGS_MODEL_NAME: SentenceTransformers embeddings model name (see https://www.sbert.net/docs/pretrained_models.html)
+ TARGET_SOURCE_CHUNKS: The amount of chunks (sources) that will be used to answer a question
+ ```
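+
+ For reference, a minimal sketch of a filled-in `.env` using the defaults above (paths and model names are examples; adjust them to your setup):
+ ```shell
+ MODEL_TYPE=GPT4All
+ PERSIST_DIRECTORY=db
+ MODEL_PATH=models/ggml-gpt4all-j-v1.3-groovy.bin
+ MODEL_N_CTX=1000
+ MODEL_N_BATCH=8
+ EMBEDDINGS_MODEL_NAME=all-MiniLM-L6-v2
+ TARGET_SOURCE_CHUNKS=4
+ ```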
+
+ Note: because of the way `langchain` loads the `SentenceTransformers` embeddings, the first time you run the script it will require an internet connection to download the embeddings model itself.
+
+ ## Test dataset
+ This repo uses a [state of the union transcript](https://github.com/imartinez/privateGPT/blob/main/source_documents/state_of_the_union.txt) as an example.
+
+ ## Instructions for ingesting your own dataset
+
+ Put any and all your files into the `source_documents` directory.
+
+ The supported extensions are:
+
+ - `.csv`: CSV,
+ - `.docx`: Word Document,
+ - `.doc`: Word Document,
+ - `.enex`: EverNote,
+ - `.eml`: Email,
+ - `.epub`: EPub,
+ - `.html`: HTML File,
+ - `.md`: Markdown,
+ - `.msg`: Outlook Message,
+ - `.odt`: Open Document Text,
+ - `.pdf`: Portable Document Format (PDF),
+ - `.pptx`: PowerPoint Document,
+ - `.ppt`: PowerPoint Document,
+ - `.txt`: Text file (UTF-8)
+
+ Run the following command to ingest all the data.
+
+ ```shell
+ python ingest.py
+ ```
+
+ Output should look like this:
+
+ ```shell
+ Creating new vectorstore
+ Loading documents from source_documents
+ Loading new documents: 100%|██████████████████████| 1/1 [00:01<00:00,  1.73s/it]
+ Loaded 1 new documents from source_documents
+ Split into 90 chunks of text (max. 500 tokens each)
+ Creating embeddings. May take some minutes...
+ Using embedded DuckDB with persistence: data will be stored in: db
+ Ingestion complete! You can now run privateGPT.py to query your documents
+ ```
+
+ It will create a `db` folder containing the local vectorstore, which will take 20-30 seconds per document, depending on the size of the document.
+ You can ingest as many documents as you want, and all will be accumulated in the local embeddings database.
+ If you want to start from an empty database, delete the `db` folder.
+
+ Note: during the ingest process no data leaves your local environment. You could ingest without an internet connection, except for the first time you run the ingest script, when the embeddings model is downloaded.
+
+ ## Ask questions to your documents, locally!
+ In order to ask a question, run a command like:
+
+ ```shell
+ python privateGPT.py
+ ```
+
+ And wait for the script to require your input.
+
+ ```plaintext
+ > Enter a query:
+ ```
+
+ Hit enter. You'll need to wait 20-30 seconds (depending on your machine) while the LLM model consumes the prompt and prepares the answer. Once done, it will print the answer and the 4 sources it used as context from your documents; you can then ask another question without re-running the script, just wait for the prompt again.
+
+ Note: you could turn off your internet connection, and inference would still work. No data gets out of your local environment.
+
+ Type `exit` to finish the script.
+
+
+ ### CLI
+ The script also supports optional command-line arguments to modify its behavior. You can see a full list of these arguments by running the command ```python privateGPT.py --help``` in your terminal.
+
+
+ # How does it work?
+ By selecting the right local models and leveraging the power of `LangChain`, you can run the entire pipeline locally, without any data leaving your environment, and with reasonable performance.
+
+ - `ingest.py` uses `LangChain` tools to parse the documents and create embeddings locally using `HuggingFaceEmbeddings` (`SentenceTransformers`). It then stores the result in a local vector database using the `Chroma` vector store.
+ - `privateGPT.py` uses a local LLM based on `GPT4All-J` or `LlamaCpp` to understand questions and create answers. The context for the answers is extracted from the local vector store using a similarity search to locate the right piece of context from the docs (see the sketch after this list).
+ - The `GPT4All-J` wrapper was introduced in LangChain 0.0.162.
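+
+ As a condensed, illustrative sketch (not the exact script), the query side boils down to the same LangChain calls used in `qaPipeline.py` in this repo; the model path, persist directory, and `k` below are assumptions taken from the defaults above:
+
+ ```python
+ from langchain.chains import RetrievalQA
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.llms import GPT4All
+ from langchain.vectorstores import Chroma
+
+ # Load the same embeddings used at ingest time, then open the persisted vectorstore
+ embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+ db = Chroma(persist_directory="db", embedding_function=embeddings)
+
+ # Local GPT4All-J model; no data leaves the machine
+ llm = GPT4All(model="models/ggml-gpt4all-j-v1.3-groovy.bin", backend="gptj")
+
+ # "stuff" chain: the retrieved chunks are stuffed into the prompt as context
+ qa = RetrievalQA.from_chain_type(
+     llm=llm,
+     chain_type="stuff",
+     retriever=db.as_retriever(search_kwargs={"k": 4}),  # TARGET_SOURCE_CHUNKS
+     return_source_documents=True,
+ )
+
+ res = qa("What did the president say about inflation?")
+ print(res["result"])
+ ```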
+
+ # System Requirements
+
+ ## Python Version
+ To use this software, you must have Python 3.10 or later installed. Earlier versions of Python are not supported.
+
+ ## C++ Compiler
+ If you encounter an error while building a wheel during the `pip install` process, you may need to install a C++ compiler on your computer.
+
+ ### For Windows 10/11
+ To install a C++ compiler on Windows 10/11, follow these steps:
+
+ 1. Install Visual Studio 2022.
+ 2. Make sure the following components are selected:
+    * Universal Windows Platform development
+    * C++ CMake tools for Windows
+ 3. Download the MinGW installer from the [MinGW website](https://sourceforge.net/projects/mingw/).
+ 4. Run the installer and select the `gcc` component.
+
+ ## Mac Running Intel
+ When running a Mac with Intel hardware (not M1), you may run into _clang: error: the clang compiler does not support '-march=native'_ during pip install.
+
+ If so, set your archflags during pip install, e.g.: _ARCHFLAGS="-arch x86_64" pip3 install -r requirements.txt_
+
+ # Disclaimer
+ This is a test project to validate the feasibility of a fully private solution for question answering using LLMs and Vector embeddings. It is not production ready, and it is not meant to be used in production. The model selection is not optimized for performance, but for privacy; however, it is possible to use different models and vectorstores to improve performance.
__pycache__/chroma.cpython-311.pyc ADDED
Binary file (5.25 kB).
 
__pycache__/chromaDb.cpython-311.pyc ADDED
Binary file (5.25 kB).
 
__pycache__/config.cpython-311.pyc ADDED
Binary file (436 Bytes).
 
__pycache__/faissDb.cpython-311.pyc ADDED
Binary file (1.94 kB).
 
__pycache__/qaPipeline.cpython-311.pyc ADDED
Binary file (5.06 kB).
 
app.py ADDED
@@ -0,0 +1,179 @@
+ """
+ Python Backend API to chat with private data
+
+ 08/16/2023
+ D.M. Theekshana Samaradiwakara
+ """
+
+ import os
+ import streamlit as st
+ from streamlit.logger import get_logger
+
+ logger = get_logger(__name__)
+
+ from ui.htmlTemplates import css, bot_template, user_template, source_template
+ from config import MODELS, DATASETS
+
+ from qaPipeline import QAPipeline
+ from faissDb import create_faiss
+
+ # loads environment variables
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ isHuggingFaceHubEnabled = os.environ.get('ENABLE_HUGGINGFACE_HUB_MODELS')
+ isOpenAiApiEnabled = os.environ.get('ENABLE_OPENAI_API_MODELS')
+
+ qaPipeline = QAPipeline()
+
+ def initialize_session_state():
+     # Initialise all session state variables with defaults.
+     # Store the "DEFAULT" keys (not their values): handle_userinput
+     # looks these up in MODELS/DATASETS.
+     SESSION_DEFAULTS = {
+         "model": "DEFAULT",
+         "dataset": "DEFAULT",
+         "chat_history": None,
+         "is_parameters_changed": False,
+         "show_source_files": False
+     }
+
+     for k, v in SESSION_DEFAULTS.items():
+         if k not in st.session_state:
+             st.session_state[k] = v
+
+
+ def main():
+
+     st.set_page_config(page_title="Chat with data",
+                        page_icon=":books:")
+     st.write(css, unsafe_allow_html=True)
+
+     initialize_session_state()
+
+     st.header("Chat with your own data:")
+
+     user_question = st.text_input(
+         "Ask a question about your documents:",
+         placeholder="enter question",
+     )
+     # Interactive questions and answers
+     if user_question:
+         with st.spinner("Processing"):
+             handle_userinput(user_question)
+
+     with st.sidebar:
+         st.subheader("Chat parameters")
+
+         chat_model = st.selectbox(
+             "Chat model",
+             MODELS,
+             key="chat_model",
+             help="Select the LLM model for the chat",
+             on_change=update_parameters_change,
+         )
+
+         # data_source = st.selectbox(
+         #     "dataset",
+         #     DATASETS,
+         #     key="data_source",
+         #     help="Select the private data_source for the chat",
+         #     on_change=update_parameters_change,
+         # )
+
+         st.session_state.dataset = "DEFAULT"
+
+         show_source = st.checkbox(
+             label="show source files",
+             key="show_source",
+             help="Select this to show relevant source files for the query",
+             on_change=update_parameters_change,
+         )
+
+         if st.session_state.is_parameters_changed:
+             if st.button("Update"):
+                 st.session_state.model = chat_model
+                 st.session_state.dataset = "DEFAULT"
+                 st.session_state.show_source_files = show_source
+                 st.success("done")
+                 st.session_state.is_parameters_changed = False
+                 return
+
+         st.markdown("\n")
+
+         if st.button("Create FAISS db"):
+             with st.spinner('creating faiss vector store'):
+                 create_faiss()
+             st.success('faiss saved')
+
+         st.markdown(
+             "### How to use\n"
+             "1. Select the chat model\n"  # noqa: E501
+             "2. Select \"show source files\" to show the source files related to the answer.📄\n"
+             "3. Ask a question about the documents💬\n"
+         )
+
+
+ def update_parameters_change():
+     st.session_state.is_parameters_changed = True
+
+ def get_answer_from_backend(query, model, dataset):
+     response = qaPipeline.run(query=query, model=model, dataset=dataset)
+     return response
+
+ def show_query_response(query, response, show_source_files):
+     answer, docs = response['result'], response['source_documents']
+
+     st.write(user_template.replace(
+         "{{MSG}}", query), unsafe_allow_html=True)
+     st.write(bot_template.replace(
+         "{{MSG}}", answer), unsafe_allow_html=True)
+
+     if show_source_files:
+         # st.write(source_template.replace(
+         #     "{{MSG}}", "source files"), unsafe_allow_html=True)
+         st.markdown("#### source files : ")
+         for source in docs:
+             # st.info(source.metadata)
+             with st.expander(source.metadata["source"]):
+                 st.markdown(source.page_content)
+
+     # st.write(response)
+
+ def is_query_valid(query: str) -> bool:
+     if (not query) or (query.strip() == ''):
+         st.error("Please enter a question!")
+         return False
+     return True
+
+ def handle_userinput(query):
+     # Get the answer from the chain
+     try:
+         if not is_query_valid(query):
+             st.stop()
+
+         model = MODELS[st.session_state.model]
+         dataset = DATASETS[st.session_state.dataset]
+         show_source_files = st.session_state.show_source_files
+
+         print(f">\n model: {model} \n dataset : {dataset} \n show_source_files : {show_source_files}")
+
+         response = get_answer_from_backend(query, model, dataset)
+
+         show_query_response(query, response, show_source_files)
+
+     except Exception as e:
+         # logger.error(f"Answer retrieval failed with {e}")
+         st.error(f"Error : {e}")  # , icon=":books:"
+         return
+
+ if __name__ == "__main__":
+     main()
chromaDb.py ADDED
@@ -0,0 +1,102 @@
+ """
+ Python Backend API to chat with private data
+
+ 08/14/2023
+ D.M. Theekshana Samaradiwakara
+ """
+
+ import os
+ from dotenv import load_dotenv
+ import glob
+
+ import torch
+ import pickle
+ import io
+
+ from langchain.vectorstores import Chroma
+ from langchain.vectorstores import FAISS
+
+ from langchain.embeddings import HuggingFaceEmbeddings
+
+ from chromadb.config import Settings
+
+ load_dotenv()
+
+ embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
+ embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+
+ def does_chroma_vectorstore_exist(persist_directory: str) -> bool:
+     # Checks if vectorstore exists
+     if os.path.exists(os.path.join(persist_directory, 'index')):
+         if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
+             list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
+             list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
+             # At least 3 documents are needed in a working vectorstore
+             if len(list_index_files) > 3:
+                 return True
+     return False
+
+ def load_store(directory: str) -> Chroma:
+     index_path = "data/{0}".format(directory)
+     # index_exists = os.path.exists(index_path)
+     index_exists = does_chroma_vectorstore_exist(index_path)
+
+     if index_exists:
+         try:
+             CHROMA_SETTINGS = Settings(
+                 chroma_db_impl='duckdb+parquet',
+                 persist_directory=index_path,
+                 anonymized_telemetry=False
+             )
+
+             # return Chroma.load(index_path)
+             vectorstore = Chroma(
+                 persist_directory=index_path,
+                 embedding_function=embeddings,
+                 client_settings=CHROMA_SETTINGS
+             )
+
+             # with open("vectorstore.pkl", "wb") as f:
+             #     pickle.dump(vectorstore, f)
+
+             return vectorstore
+         except Exception as e:
+             raise Exception(f"Error loading vector store: {e} ")
+
+     else:
+         # raise exception if the vector store has not been created
+         raise Exception(f"A vector store in directory {directory} is not created. Please choose a valid one")
+
+ class CPU_Unpickler(pickle.Unpickler):
+     def find_class(self, module, name):
+         if module == 'torch.storage' and name == '_load_from_bytes':
+             return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
+         else:
+             return super().find_class(module, name)
+
+ def create_db(document_splits, persist_directory):
+     return Chroma.from_documents(
+         documents=document_splits,
+         embedding=embeddings,
+         persist_directory=persist_directory
+     )
+
+ def save_files(persist_directory, document_splits):
+     print("Saving document splits...")
+     index_path = "data/{0}".format(persist_directory)
+     if does_chroma_vectorstore_exist(index_path):
+         print("Updating existing vector store. May take some minutes...")
+         # update the existing store
+         db = Chroma(
+             persist_directory=index_path,
+             embedding_function=embeddings,
+         )
+         db.add_documents(document_splits)
+     else:
+         print("Creating new vector store. May take some minutes...")
+         db = create_db(document_splits, index_path)
+     db.persist()
config.py ADDED
@@ -0,0 +1,16 @@
+ MODELS = {
+     "DEFAULT": "tiiuae/falcon-7b-instruct",
+     "gpt4all": "gpt4all",
+     "flan-t5-xxl": "google/flan-t5-xxl",
+     "falcon-7b-instruct": "tiiuae/falcon-7b-instruct",
+     "openai gpt-3.5": "openai",
+ }
+
+ DATASETS = {
+     "DEFAULT": "chroma_txt",
+     "a": "A",
+     "b": "B",
+     "c": "C",
+ }
dataPipeline.py ADDED
@@ -0,0 +1,144 @@
+ """
+ Python Backend API to chat with private data
+
+ 08/15/2023
+ D.M. Theekshana Samaradiwakara
+ """
+
+ import os
+ import time
+ import glob
+ from multiprocessing import Pool
+ from tqdm import tqdm
+ from dotenv import load_dotenv
+
+ from chromaDb import save_files, load_store
+
+ from langchain.document_loaders import (
+     CSVLoader,
+     EverNoteLoader,
+     PyMuPDFLoader,
+     TextLoader,
+     UnstructuredEmailLoader,
+     UnstructuredEPubLoader,
+     UnstructuredHTMLLoader,
+     UnstructuredMarkdownLoader,
+     UnstructuredODTLoader,
+     UnstructuredPowerPointLoader,
+     UnstructuredWordDocumentLoader,
+ )
+
+ from langchain.document_loaders import DirectoryLoader
+ text_loader_kwargs = {'autodetect_encoding': True}
+
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.docstore.document import Document
+
+ load_dotenv()
+
+ # chunk settings come from the environment as strings; cast them to int
+ chunk_size = int(os.environ.get('EMBEDDING_CHUNK_SIZE'))
+ chunk_overlap = int(os.environ.get('EMBEDDING_CHUNK_OVERLAP'))
+ embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
+
+ # Map file extensions to document loaders and their arguments
+ LOADER_MAPPING = {
+     ".csv": (CSVLoader, {}),
+     # ".docx": (Docx2txtLoader, {}),
+     ".doc": (UnstructuredWordDocumentLoader, {}),
+     ".docx": (UnstructuredWordDocumentLoader, {}),
+     ".enex": (EverNoteLoader, {}),
+     ".eml": (UnstructuredEmailLoader, {}),
+     ".epub": (UnstructuredEPubLoader, {}),
+     ".html": (UnstructuredHTMLLoader, {}),
+     ".md": (UnstructuredMarkdownLoader, {}),
+     ".odt": (UnstructuredODTLoader, {}),
+     ".pdf": (PyMuPDFLoader, {}),
+     ".ppt": (UnstructuredPowerPointLoader, {}),
+     ".pptx": (UnstructuredPowerPointLoader, {}),
+     ".txt": (TextLoader, {"encoding": "utf8"}),
+     # Add more mappings for other file extensions and loaders as needed
+ }
+
+ class DataPipeline:
+
+     def __init__(self):
+         self.dataset_name = None
+         self.vectorstore = None
+
+     def load_documents_in_folder(self, folder):
+         print("loading documents...")
+         loader = DirectoryLoader(folder, glob="**/[!.]*", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
+         pages = loader.load()
+         return pages
+
+     def load_single_document(self, doc):
+         ext = "." + doc.name.rsplit(".", 1)[-1]
+         if ext in LOADER_MAPPING:
+             loader_class, loader_args = LOADER_MAPPING[ext]
+             loader = loader_class(doc, **loader_args)
+             return loader.load()
+
+         raise ValueError(f"Unsupported file extension '{ext}'")
+
+     def load_documents(self, uploaded_files):
+         with Pool(processes=os.cpu_count()) as pool:
+             results = []
+             with tqdm(total=len(uploaded_files), desc='Loading new documents', ncols=80) as pbar:
+                 for i, docs in enumerate(pool.imap_unordered(self.load_single_document, uploaded_files)):
+                     results.extend(docs)
+                     pbar.update()
+
+         return results
+
+     def load_streamlit_documents(self, uploaded_files, year):
+         documents = []
+         for uploaded_file in uploaded_files:
+             print("\n\n uploaded_file \n\n", uploaded_file, "\n")
+             source = uploaded_file.name
+             print("\n\n source \n\n", source, "\n")
+             content = uploaded_file.read().decode('latin-1')
+             print("\n\n content \n\n", content[:10], "\n")
+
+             doc = Document(
+                 page_content=content,
+                 metadata={
+                     "source": source,
+                     'year': year
+                 }
+             )
+             print("\n doc \n\n", doc, "\n\n\n\n")
+
+             documents.append(doc)
+
+         return documents
+
+     def process_documents(self, documents):
+         print("Creating embeddings. May take some minutes...")
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             separators=["\n\n", "\n", "(?<=\. )", " ", ""]
+         )
+         texts = text_splitter.split_documents(documents)
+         return texts
+
+     def persist_documents(self, persist_directory, document_splits):
+         save_files(persist_directory, document_splits)
+
+     def add_metadata(self, documents, metadata, value):
+         for doc in documents:
+             doc.metadata[metadata] = value
+         return documents
faissDb.py ADDED
@@ -0,0 +1,34 @@
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.docstore.document import Document
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.document_loaders import TextLoader
+ from langchain.document_loaders import DirectoryLoader
+ from langchain.vectorstores.faiss import FAISS
+
+ EMBEDDINGS_MODEL_NAME = "all-MiniLM-L6-v2"
+ embeddings_model_name = EMBEDDINGS_MODEL_NAME
+ embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+ persist_directory = "data/cbsl"
+ index_path = persist_directory
+
+ chunk_size = 1000
+ chunk_overlap = 50
+
+
+ def create_faiss():
+     # documents = DirectoryLoader(persist_directory, loader_cls=PyMuPDFLoader).load()
+     documents = DirectoryLoader("CBSL", loader_cls=PyPDFLoader).load()
+
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     texts = text_splitter.split_documents(documents)
+
+     vectorstore = FAISS.from_documents(texts, embeddings)
+     vectorstore.save_local("faiss_index")
+
+
+ def load_FAISS_store():
+     return FAISS.load_local("faiss_index", embeddings)
fileUpload.py ADDED
@@ -0,0 +1,148 @@
+ """
+ Python Backend API to chat with private data
+
+ 08/17/2023
+ D.M. Theekshana Samaradiwakara
+ """
+
+ import os
+ import streamlit as st
+ from streamlit.logger import get_logger
+ from io import StringIO
+
+ logger = get_logger(__name__)
+
+ from dataPipeline import DataPipeline
+
+ def initialize_session_state():
+     # Initialise all session state variables with defaults
+     SESSION_DEFAULTS = {
+         "data_index": None,
+         "published_year": 2023,
+         "is_parameters_changed": False,
+         "is_input_validated": False,
+     }
+
+     for k, v in SESSION_DEFAULTS.items():
+         if k not in st.session_state:
+             st.session_state[k] = v
+
+ def update_parameters_change():
+     st.session_state.is_parameters_changed = True
+
+ def validate_index():
+     index = st.session_state.data_index
+     if (not index) or (not index.strip()):
+         st.error("Empty index directory name!")
+         st.stop()
+
+     st.info(f"file persist directory name: {index}")
+
+ def validate_files(uploaded_file):
+     if not uploaded_file:
+         st.error("No uploaded files to process!")
+         st.stop()
+
+     st.info(f"No of files uploaded : {len(uploaded_file)}")
+
+ def validate_published_year():
+     if not st.session_state.published_year:
+         st.error("Invalid year!")
+         st.stop()
+
+     st.info(f"file published year : {st.session_state.published_year}")
+
+ def validate_inputs(uploaded_file):
+     validate_index()
+     validate_published_year()
+     validate_files(uploaded_file)
+
+     return True
+
+
+ def process_files(uploaded_files, data_index):
+     try:
+         st.info(uploaded_files)
+         dataPipe = DataPipeline()
+
+         documents = dataPipe.load_streamlit_documents(uploaded_files, st.session_state.published_year)
+
+         # documents = dataPipe.add_metadata(documents, "year", st.session_state.published_year)
+         # process_docs = dataPipe.process_documents(documents)
+         # st.success("files successfully processed!")
+
+         # dataPipe.persist_documents(data_index, process_docs)
+         # st.success("files successfully stored!")
+
+     except Exception as e:
+         st.error(str(e))
+
+
+ # sidebar function
+ def sidebar():
+     with st.sidebar:
+         st.subheader("Data indexing parameters")
+
+         persist_index_name = st.text_input(
+             label="file persist directory name",
+             placeholder="enter index name",
+             key="persist_index_name",
+             help="name of the directory in which processed files will be persisted.",
+             on_change=update_parameters_change,
+         )
+
+         publish_year = st.number_input(
+             label="published year",
+             min_value=1950,
+             value=2023,
+             max_value=2025,
+             key="publish_year",
+             help="year the files were published.",
+             on_change=update_parameters_change,
+         )
+
+         if st.session_state.is_parameters_changed:
+             st.session_state.data_index = persist_index_name
+             st.session_state.published_year = publish_year
+             st.session_state.is_parameters_changed = False
+             st.info(f"file persist directory name: {st.session_state.data_index}")
+             st.info(f"file published year : {st.session_state.published_year}")
+
+
+ # main function
+ def main():
+     st.set_page_config(page_title="upload files to database", page_icon="📖")  # , layout="wide"
+     st.header("📖Boardpac chat App")
+
+     initialize_session_state()
+
+     sidebar()
+
+     uploaded_file = st.file_uploader(
+         "Upload your files here and click on 'Process'",
+         key="uploaded_file",
+         accept_multiple_files=True,
+         help="Upload files here!",
+     )
+
+     col1, col2 = st.columns(2)
+
+     with col1:
+         if st.button("validate"):
+             if validate_inputs(uploaded_file):
+                 st.session_state.is_input_validated = True
+
+     with col2:
+         if st.session_state.is_input_validated:
+             if st.button("process"):
+                 with st.spinner("Indexing document... This may take a while⏳"):
+                     process_files(uploaded_file, st.session_state.data_index)
+                 uploaded_file = None
+                 st.session_state.is_input_validated = False
+
+
+ if __name__ == "__main__":
+     main()
qaPipeline.py ADDED
@@ -0,0 +1,110 @@
+ """
+ Python Backend API to chat with private data
+
+ 08/14/2023
+ D.M. Theekshana Samaradiwakara
+ """
+
+ import os
+ import time
+
+ from dotenv import load_dotenv
+
+ from langchain.chains import RetrievalQA
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+
+ from langchain.llms import GPT4All
+ from langchain.llms import HuggingFaceHub
+ from langchain.chat_models import ChatOpenAI
+
+ # from langchain.retrievers.self_query.base import SelfQueryRetriever
+ # from langchain.chains.query_constructor.base import AttributeInfo
+
+ # from chromaDb import load_store
+ from faissDb import load_FAISS_store
+
+ load_dotenv()
+
+ # gpt4all model
+ gpt4all_model_path = os.environ.get('GPT4ALL_MODEL_PATH')
+ model_n_ctx = os.environ.get('MODEL_N_CTX')
+ model_n_batch = int(os.environ.get('MODEL_N_BATCH', 8))
+ target_source_chunks = int(os.environ.get('TARGET_SOURCE_CHUNKS', 4))
+
+ openai_api_key = os.environ.get('OPENAI_API_KEY')
+
+ # VERBOSE is read from the environment as a string; convert it to a real boolean
+ verbose = os.environ.get('VERBOSE', 'False').strip().lower() in ('true', '1')
+
+ # activate/deactivate the streaming StdOut callback for LLMs
+ callbacks = [StreamingStdOutCallbackHandler()]
+
+ class QAPipeline:
+
+     def __init__(self):
+         self.llm_name = None
+         self.llm = None
+
+         self.dataset_name = None
+         self.vectorstore = None
+
+         self.qa_chain = None
+
+     def run(self, query, model, dataset):
+         if (self.llm_name != model) or (self.dataset_name != dataset) or (self.qa_chain is None):
+             self.set_model(model)
+             self.set_vectorstore(dataset)
+             self.set_qa_chain()
+
+         # Get the answer from the chain
+         start = time.time()
+         res = self.qa_chain(query)
+         # answer, docs = res['result'], res['source_documents']
+         end = time.time()
+
+         # Print the result
+         print("\n\n> Question:")
+         print(query)
+         print(f"\n> Answer (took {round(end - start, 2)} s.):")
+         print(res)
+
+         return res
+
+     def set_model(self, model_type):
+         if model_type != self.llm_name:
+             match model_type:
+                 case "gpt4all":
+                     # self.llm = GPT4All(model=gpt4all_model_path, n_ctx=model_n_ctx, backend='gptj', n_batch=model_n_batch, callbacks=callbacks, verbose=verbose)
+                     self.llm = GPT4All(model=gpt4all_model_path, max_tokens=model_n_ctx, backend='gptj', n_batch=model_n_batch, callbacks=callbacks, verbose=verbose)
+                     # self.llm = HuggingFaceHub(repo_id="nomic-ai/gpt4all-j", model_kwargs={"temperature": 0.001, "max_length": 1024})
+                 case "google/flan-t5-xxl":
+                     self.llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.001, "max_length": 1024})
+                 case "tiiuae/falcon-7b-instruct":
+                     self.llm = HuggingFaceHub(repo_id=model_type, model_kwargs={"temperature": 0.001, "max_length": 1024})
+                 case "openai":
+                     self.llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
+                 case _:
+                     # raise exception if model_type is not supported
+                     raise Exception(f"Model type {model_type} is not supported. Please choose a valid one")
+
+             self.llm_name = model_type
+
+     def set_vectorstore(self, dataset):
+         if dataset != self.dataset_name:
+             # self.vectorstore = load_store(dataset)
+             self.vectorstore = load_FAISS_store()
+             print("\n\n> vectorstore loaded:")
+             self.dataset_name = dataset
+
+     def set_qa_chain(self):
+         self.qa_chain = RetrievalQA.from_chain_type(
+             llm=self.llm,
+             chain_type="stuff",
+             retriever=self.vectorstore.as_retriever(),
+             # retriever=self.vectorstore.as_retriever(search_kwargs={"k": target_source_chunks}),
+             return_source_documents=True
+         )
+
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ langchain
+ openai
+ gpt4all
+
+ chromadb
+ duckdb
+
+ torch
+ faiss-cpu
+
+ streamlit
+ # huggingface llms
+ huggingface-hub
+ sentence_transformers
+
+ python-dotenv
schema/apiSchema.py ADDED
@@ -0,0 +1,28 @@
+ """
+ Python Backend API to chat with private data
+
+ 08/14/2023
+ D.M. Theekshana Samaradiwakara
+ """
+
+ from typing import Optional, List, Any, Dict
+ from pydantic import BaseModel
+
+
+ class Document(BaseModel):
+     name: Optional[str]
+     page_content: str
+     metadata: Dict[str, Any]
+
+
+ class QueryModel(BaseModel):
+     model: str
+     dataset: str
+     question: str
+     history: Optional[list] = None
+
+
+ class ResponseModel(BaseModel):
+     success: Optional[str] = None
+     error: Optional[str] = None
+     documents: List[Document]  # = None
ui/__pycache__/htmlTemplates.cpython-311.pyc ADDED
Binary file (1.42 kB).
 
ui/a.jpg ADDED
ui/bot1.jpg ADDED
ui/bot2.webp ADDED
ui/htmlTemplates.py ADDED
@@ -0,0 +1,51 @@
+ css = '''
+ <style>
+ .chat-message {
+     padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
+ }
+ .chat-message.user {
+     background-color: #2b313e
+ }
+ .chat-message.bot {
+     background-color: #475063
+ }
+ .chat-message .avatar {
+     width: 20%;
+ }
+ .chat-message .avatar img {
+     max-width: 78px;
+     max-height: 78px;
+     border-radius: 50%;
+     object-fit: cover;
+ }
+ .chat-message .message {
+     width: 80%;
+     padding: 0 1.5rem;
+     color: #fff;
+ }
+ </style>
+ '''
+
+ bot_template = '''
+ <div class="chat-message bot">
+     <div class="avatar">
+         <img src="https://as2.ftcdn.net/v2/jpg/05/56/09/81/1000_F_556098117_GdiFN9p9j89dpt3JhLJsegV76tt1NhfA.jpg">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
+ user_template = '''
+ <div class="chat-message user">
+     <div class="avatar">
+         <img src="https://coursera-profile-photos.s3.amazonaws.com/2a/f80e20d0fe4e628036656d2ec2b85b/a.jpg">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
+ source_template = '''
+ <div class="chat-message bot">
+     <div class="avatar">
+         <img src="https://st.depositphotos.com/1427101/4468/v/950/depositphotos_44680417-stock-illustration-pdf-paper-sheet-icons.jpg">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
ui/pdf.jpg ADDED