ali121300 committed
Commit a30daff
1 Parent(s): 1f516bf

Upload 8 files

Files changed (8):
  1. .gitignore +160 -0
  2. Dockerfile +20 -0
  3. LICENSE +21 -0
  4. README.md +30 -12
  5. chatbot_app.py +171 -0
  6. constants.py +8 -0
  7. ingest.py +32 -0
  8. requirements.txt +21 -0
.gitignore ADDED
@@ -0,0 +1,160 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
Dockerfile ADDED
@@ -0,0 +1,20 @@
+ # Use the official Python image
+ FROM python:3.10
+
+ # Set the working directory inside the container
+ WORKDIR /app
+
+ # Copy the requirements.txt file first to leverage the Docker cache
+ COPY requirements.txt .
+
+ # Install required Python packages
+ RUN pip install -r requirements.txt --default-timeout=100 future
+
+ # Copy the rest of the application files to the container's working directory
+ COPY . .
+
+ # Expose the port that Streamlit will run on
+ EXPOSE 8501
+
+ # Command to run the Streamlit application
+ CMD ["streamlit", "run", "chatbot_app.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 AI Anytime
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,30 @@
- ---
- title: Local GPT 2
- emoji: 🦀
- colorFrom: blue
- colorTo: gray
- sdk: streamlit
- sdk_version: 1.31.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Chat-with-PDF-Chatbot
+ This chatbot is an interactive app that lets users chat with the contents of their PDF files. It is built entirely on an open-source stack; no OpenAI API key is required.
+
+ ## Getting Started
+
+ Follow these steps to set up and run the project on your local machine.
+
+ ### Installation
+
+ ```sh
+ # Clone the repository
+ git clone <repository_url>
+
+ # Create the necessary folders
+ mkdir db
+ mkdir models    # add your model files to the 'models' folder
+ mkdir docs
+ ```
+
+ ### Usage
+
+ Run the ingestion script to prepare the data:
+
+ `python ingest.py`
+
+ Start the chatbot application using Streamlit:
+
+ `streamlit run chatbot_app.py`
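+
+ ### Run with Docker (optional)
+
+ This commit also adds a Dockerfile, so the app can run in a container. A minimal sketch of building and running it; the image tag `chat-with-pdf` is an arbitrary choice, and 8501 is the port the Dockerfile exposes for Streamlit:
+
+ ```sh
+ # Build the image from the repository root
+ docker build -t chat-with-pdf .
+
+ # Run the container, mapping Streamlit's port to the host
+ docker run -p 8501:8501 chat-with-pdf
+ ```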
chatbot_app.py ADDED
@@ -0,0 +1,171 @@
+ import streamlit as st
+ import os
+ import base64
+ import time
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from transformers import pipeline
+ import torch
+ import textwrap
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import SentenceTransformerEmbeddings
+ from langchain.vectorstores import Chroma
+ from langchain.llms import HuggingFacePipeline
+ from langchain.chains import RetrievalQA
+ from constants import CHROMA_SETTINGS
+ from streamlit_chat import message
+
+ st.set_page_config(layout="wide")
+
+ device = torch.device('cpu')
+
+ checkpoint = "MBZUAI/LaMini-T5-738M"
+ print(f"Checkpoint path: {checkpoint}")  # for debugging
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+ base_model = AutoModelForSeq2SeqLM.from_pretrained(
+     checkpoint,
+     device_map=device,
+     torch_dtype=torch.float32
+ )
+
+ persist_directory = "db"
+
+ @st.cache_resource
+ def data_ingestion():
+     for root, dirs, files in os.walk("docs"):
+         for file in files:
+             if file.endswith(".pdf"):
+                 print(file)
+                 loader = PDFMinerLoader(os.path.join(root, file))
+                 documents = loader.load()
+                 # keep the overlap smaller than the chunk size (as in ingest.py)
+                 text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+                 texts = text_splitter.split_documents(documents)
+                 # create embeddings here
+                 embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+                 # create vector store here
+                 db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
+                 db.persist()
+                 db = None
+
+ @st.cache_resource
+ def llm_pipeline():
+     pipe = pipeline(
+         'text2text-generation',
+         model=base_model,
+         tokenizer=tokenizer,
+         max_length=256,
+         do_sample=True,
+         temperature=0.3,
+         top_p=0.95,
+         device=device
+     )
+     local_llm = HuggingFacePipeline(pipeline=pipe)
+     return local_llm
+
+ @st.cache_resource
+ def qa_llm():
+     llm = llm_pipeline()
+     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+     db = Chroma(persist_directory="db", embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+     retriever = db.as_retriever()
+     qa = RetrievalQA.from_chain_type(
+         llm=llm,
+         chain_type="stuff",
+         retriever=retriever,
+         return_source_documents=True
+     )
+     return qa
+
+ def process_answer(instruction):
+     # instruction is a dict of the form {'query': <user question>}
+     qa = qa_llm()
+     generated_text = qa(instruction)
+     answer = generated_text['result']
+     return answer
+
+ def get_file_size(file):
+     file.seek(0, os.SEEK_END)
+     file_size = file.tell()
+     file.seek(0)
+     return file_size
+
+ # Function to display the PDF of a given file
+ @st.cache_data
+ def displayPDF(file):
+     # Opening file from file path
+     with open(file, "rb") as f:
+         base64_pdf = base64.b64encode(f.read()).decode('utf-8')
+
+     # Embedding PDF in HTML
+     pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
+
+     # Displaying File
+     st.markdown(pdf_display, unsafe_allow_html=True)
+
+ # Display conversation history using Streamlit messages
+ def display_conversation(history):
+     for i in range(len(history["generated"])):
+         message(history["past"][i], is_user=True, key=str(i) + "_user")
+         message(history["generated"][i], key=str(i))
+
+ def main():
+     st.markdown("<h1 style='text-align: center; color: blue;'>Chat with your PDF 🦜📄 </h1>", unsafe_allow_html=True)
+     st.markdown("<h3 style='text-align: center; color: grey;'>Built by <a href='https://github.com/AIAnytime'>AI Anytime with ❤️ </a></h3>", unsafe_allow_html=True)
+
+     st.markdown("<h2 style='text-align: center; color: red;'>Upload your PDF 👇</h2>", unsafe_allow_html=True)
+
+     uploaded_file = st.file_uploader("", type=["pdf"])
+
+     if uploaded_file is not None:
+         file_details = {
+             "Filename": uploaded_file.name,
+             "File size": get_file_size(uploaded_file)
+         }
+         filepath = "docs/" + uploaded_file.name
+         with open(filepath, "wb") as temp_file:
+             temp_file.write(uploaded_file.read())
+
+         col1, col2 = st.columns([1, 2])
+         with col1:
+             st.markdown("<h4 style='color:black;'>File details</h4>", unsafe_allow_html=True)
+             st.json(file_details)
+             st.markdown("<h4 style='color:black;'>File preview</h4>", unsafe_allow_html=True)
+             pdf_view = displayPDF(filepath)
+
+         with col2:
+             with st.spinner('Embeddings are in process...'):
+                 ingested_data = data_ingestion()
+             st.success('Embeddings are created successfully!')
+             st.markdown("<h4 style='color:black;'>Chat Here</h4>", unsafe_allow_html=True)
+
+             user_input = st.text_input("", key="input")
+
+             # Initialize session state for generated responses and past messages
+             if "generated" not in st.session_state:
+                 st.session_state["generated"] = ["I am ready to help you"]
+             if "past" not in st.session_state:
+                 st.session_state["past"] = ["Hey there!"]
+
+             # Search the database for a response based on user input and update session state
+             if user_input:
+                 answer = process_answer({'query': user_input})
+                 st.session_state["past"].append(user_input)
+                 st.session_state["generated"].append(answer)
+
+             # Display conversation history using Streamlit messages
+             if st.session_state["generated"]:
+                 display_conversation(st.session_state)
+
+ if __name__ == "__main__":
+     main()
constants.py ADDED
@@ -0,0 +1,8 @@
+ import os
+ import chromadb
+ from chromadb.config import Settings
+ CHROMA_SETTINGS = Settings(
+     chroma_db_impl='duckdb+parquet',
+     persist_directory='db',
+     anonymized_telemetry=False
+ )
ingest.py ADDED
@@ -0,0 +1,32 @@
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import SentenceTransformerEmbeddings
+ from langchain.vectorstores import Chroma
+ import os
+ from constants import CHROMA_SETTINGS
+
+ persist_directory = "db"
+
+ def main():
+     for root, dirs, files in os.walk("docs"):
+         for file in files:
+             if file.endswith(".pdf"):
+                 print(file)
+                 loader = PyPDFLoader(os.path.join(root, file))
+                 documents = loader.load()
+                 print("splitting into chunks")
+                 text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+                 texts = text_splitter.split_documents(documents)
+                 # create embeddings here
+                 print("Loading sentence transformers model")
+                 embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+                 # create vector store here
+                 print("Creating embeddings. May take some minutes...")
+                 db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
+                 db.persist()
+                 db = None
+
+     print("Ingestion complete! You can now run `streamlit run chatbot_app.py` to query your documents")
+
+ if __name__ == "__main__":
+     main()
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ langchain==0.0.267
+ streamlit==1.25.0
+ transformers==4.31.0
+ torch==2.0.1
+ einops==0.6.1
+ bitsandbytes==0.41.1
+ accelerate==0.21.0
+ pdfminer.six==20221105
+ bs4==0.0.1
+ duckdb==0.7.1
+ chromadb==0.3.26
+ beautifulsoup4==4.12.2
+ sentence-transformers==2.2.2
+ sentencepiece==0.1.99
+ six==1.16.0
+ requests==2.31.0
+ uvicorn==0.18.3
+ torchvision==0.15.2
+ streamlit-chat