ahmedelbeshry committed on
Commit
86f3482
1 Parent(s): 2b05c60

Upload 11 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ The-Finance-Act--2023.pdf filter=lfs diff=lfs merge=lfs -text
37
+ vectorstore/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local storage
2
+ documents/
3
+ vectorstore/
4
+ hf_model/
5
+
6
+ #python env
7
+ myvenv/
8
+
9
+ # Byte-compiled / optimized / DLL files
10
+ __pycache__/
11
+ *.py[cod]
12
+ *$py.class
13
+
14
+ # C extensions
15
+ *.so
16
+
17
+ # Distribution / packaging
18
+ .Python
19
+ build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
+ eggs/
24
+ .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ wheels/
31
+ share/python-wheels/
32
+ *.egg-info/
33
+ .installed.cfg
34
+ *.egg
35
+ MANIFEST
36
+
37
+ # PyInstaller
38
+ # Usually these files are written by a python script from a template
39
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
40
+ *.manifest
41
+ *.spec
42
+
43
+ # Installer logs
44
+ pip-log.txt
45
+ pip-delete-this-directory.txt
46
+
47
+ # Unit test / coverage reports
48
+ htmlcov/
49
+ .tox/
50
+ .nox/
51
+ .coverage
52
+ .coverage.*
53
+ .cache
54
+ nosetests.xml
55
+ coverage.xml
56
+ *.cover
57
+ *.py,cover
58
+ .hypothesis/
59
+ .pytest_cache/
60
+ cover/
61
+
62
+ # Translations
63
+ *.mo
64
+ *.pot
65
+
66
+ # Django stuff:
67
+ *.log
68
+ local_settings.py
69
+ db.sqlite3
70
+ db.sqlite3-journal
71
+
72
+ # Flask stuff:
73
+ instance/
74
+ .webassets-cache
75
+
76
+ # Scrapy stuff:
77
+ .scrapy
78
+
79
+ # Sphinx documentation
80
+ docs/_build/
81
+
82
+ # PyBuilder
83
+ .pybuilder/
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # IPython
90
+ profile_default/
91
+ ipython_config.py
92
+
93
+ # pyenv
94
+ # For a library or package, you might want to ignore these files since the code is
95
+ # intended to run in multiple environments; otherwise, check them in:
96
+ # .python-version
97
+
98
+ # pipenv
99
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
101
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
102
+ # install all needed dependencies.
103
+ #Pipfile.lock
104
+
105
+ # poetry
106
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
107
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
108
+ # commonly ignored for libraries.
109
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
110
+ #poetry.lock
111
+
112
+ # pdm
113
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
114
+ #pdm.lock
115
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
116
+ # in version control.
117
+ # https://pdm.fming.dev/#use-with-ide
118
+ .pdm.toml
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Mojtaba Fayazi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,53 @@
1
- ---
2
- title: Chatwithpdflangchain
3
- emoji: 🔥
4
- colorFrom: red
5
- colorTo: purple
6
- sdk: streamlit
7
- sdk_version: 1.35.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Chat With PDFs
2
+ Chat with your PDF files for free, using [Langchain](https://python.langchain.com/docs/get_started/quickstart), [Groq](https://console.groq.com/), [Chroma](https://docs.trychroma.com/getting-started) vector store, and [Jina AI](https://jina.ai/embeddings/) embeddings. This repository contains a simple Python implementation of a Retrieval-Augmented Generation (RAG) system. The system retrieves the chunks of the user's PDF files that are relevant to each query and uses them to provide informative responses.
3
+
4
+ ## Installation
5
+ Follow these steps:
6
+ 1. Clone the repository
7
+ ```
8
+ git clone https://github.com/S4mpl3r/chat-with-pdf.git
9
+ ```
10
+ 2. Create a virtual environment and activate it (optional, but highly recommended).
11
+ ```
12
+ python -m venv .venv
13
+ Windows: .venv\Scripts\activate
14
+ Linux: source .venv/bin/activate
15
+ ```
16
+ 3. Install required packages:
17
+ ```
18
+ python -m pip install -r requirements.txt
19
+ ```
20
+ 4. Create a .env file in the root of the project and populate it with the following keys. You'll need to obtain your own API keys:
21
+ ```
22
+ JINA_API_KEY=<YOUR KEY>
23
+ GROQ_API_KEY=<YOUR KEY>
24
+ HF_TOKEN=<YOUR TOKEN>
25
+ HF_HOME=<PATH TO STORE HUGGINGFACE MODEL>
26
+ ```
27
+ 5. Run the program:
28
+ ```
29
+ streamlit run app.py
30
+ ```
31
+ ## Configuration
32
+ You can customize the behavior of the system by modifying the constants and parameters in the app.py file:
33
+
34
+ * EMBED_MODEL_NAME: Specify the name of the Jina embedding model to be used.
35
+ * LLM_NAME: Specify the name of the language model (Refer to [Groq](https://groq.com/) for the list of available models).
36
+ * LLM_TEMPERATURE: Set the temperature parameter for the language model.
37
+ * CHUNK_SIZE: Specify the maximum chunk size allowed by the embedding model.
38
+ * DOCUMENT_DIR: Specify the directory where PDF documents are stored.
39
+ * VECTOR_STORE_DIR: Specify the directory where vector embeddings are stored.
40
+ * COLLECTION_NAME: Specify the name of the collection for the chroma vector store.
41
+
42
+ ## Resources
43
+ Kudos to the amazing libraries and services listed below:
44
+ * [Langchain](https://www.langchain.com/)
45
+ * [Groq](https://groq.com/)
46
+ * [Jina AI](https://jina.ai/)
47
+ * [ChromaDB](https://www.trychroma.com/)
48
+
49
+ ## License
50
+ MIT
51
+
52
+
53
+
The-Finance-Act--2023.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd1abaa7b10154618f866c3399c9dc9655c0b2f3ff45ccbeb57b2590c0b904b7
3
+ size 3668168
app.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+ import streamlit as st
4
+ import chromadb
5
+ from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
6
+ from langchain.chains.retrieval import create_retrieval_chain
7
+ from langchain.docstore.document import Document
8
+ from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
9
+ from langchain_community.embeddings import JinaEmbeddings
10
+ from langchain_community.vectorstores.chroma import Chroma
11
+ from langchain_core.prompts import ChatPromptTemplate
12
+ from langchain_core.runnables import Runnable
13
+ from langchain_core.vectorstores import VectorStoreRetriever
14
+ from langchain_groq import ChatGroq
15
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
16
+ from transformers import BertTokenizer
17
+
18
# CONSTANTS =====================================================
EMBED_MODEL_NAME = "jina-embeddings-v2-base-en"
LLM_NAME = "mixtral-8x7b-32768"
LLM_TEMPERATURE = 0.1

# This is the maximum chunk size allowed by the chosen embedding model. You can choose a smaller size.
CHUNK_SIZE = 8192

DOCUMENT_DIR = "E:\\test\\chat-with-pdf"  # The directory where the PDF files should be placed
VECTOR_STORE_DIR = "./vectorstore/"  # The directory where the vectors are stored
COLLECTION_NAME = "collection1"  # ChromaDB collection name
# ===============================================================

# The Jina API key is a secret and must not be committed to the repository.
# It is read from the JINA_API_KEY environment variable and is None when that
# variable is not set.
# NOTE(review): a key was previously hard-coded on this line; it remains
# visible in the repository history and should be revoked.
JINA_API_KEY = os.environ.get("JINA_API_KEY")
33
+
34
@st.cache_data
def load_documents() -> List[Document]:
    """Load the PDF files under DOCUMENT_DIR that match the glob "**/*.pdf".

    Progress and the number of loaded pages are reported through Streamlit.
    If anything raises while loading, the error is shown with st.error and an
    empty list is returned instead.
    """
    try:
        st.write("[+] Loading documents...")

        pdf_loader = DirectoryLoader(
            DOCUMENT_DIR, glob="**/*.pdf", loader_cls=PyPDFLoader
        )
        pages = pdf_loader.load()
        st.success(f"[+] Document loaded, total pages: {len(pages)}")
    except Exception as exc:
        st.error(f"[-] Error loading the document: {exc}")
        return []
    return pages
49
+
50
@st.cache_data
def chunk_document(_documents: List[Document]) -> List[Document]:
    """Split the given documents into chunks bounded by CHUNK_SIZE.

    The splitter is built from the "bert-base-uncased" tokenizer (cached in
    the directory named by the HF_HOME environment variable, if set) and uses
    an overlap of CHUNK_SIZE // 50 between chunks.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(
        "bert-base-uncased", cache_dir=os.environ.get("HF_HOME")
    )
    overlap = CHUNK_SIZE // 50
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=bert_tokenizer,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=overlap,
    )

    st.write("[+] Splitting documents...")
    pieces = splitter.split_documents(_documents)
    st.success(f"[+] Document splitting done, {len(pieces)} chunks total.")
    return pieces
67
+
68
@st.cache_resource
def create_and_store_embeddings(_embedding_model, _chunks: List[Document]) -> Chroma:
    """Build a Chroma vectorstore from the chunks using the given embedding model.

    The store uses the COLLECTION_NAME collection and VECTOR_STORE_DIR as its
    persist directory. Any failure is reported with st.error and then re-raised.
    """
    store_options = {
        "embedding": _embedding_model,
        "collection_name": COLLECTION_NAME,
        "persist_directory": VECTOR_STORE_DIR,
    }
    try:
        store = Chroma.from_documents(_chunks, **store_options)
        st.success("[+] Vectorstore created.")
    except Exception as exc:
        st.error(f"[-] Error creating and storing embeddings: {exc}")
        raise
    return store
83
+
84
@st.cache_resource
def get_vectorstore_retriever(_embedding_model) -> VectorStoreRetriever:
    """Return a retriever (k=3) over the COLLECTION_NAME Chroma collection.

    If the collection lookup raises ValueError, the collection is treated as
    missing: the PDFs are loaded, chunked, embedded and stored first.

    Raises:
        ValueError: If the collection has to be built but no documents load.
    """
    search_options = {"k": 3}
    client = chromadb.PersistentClient(VECTOR_STORE_DIR)
    try:
        # Probe for the collection; only the raised/not-raised outcome matters.
        # NOTE(review): some chromadb releases may signal a missing collection
        # with a different exception type - confirm against the pinned version.
        client.get_collection(COLLECTION_NAME)
        existing_store = Chroma(
            embedding_function=_embedding_model,
            collection_name=COLLECTION_NAME,
            persist_directory=VECTOR_STORE_DIR,
        )
        return existing_store.as_retriever(search_kwargs=search_options)
    except ValueError:
        # No stored collection yet, so build it from the PDF documents.
        loaded = load_documents()
        if not loaded:
            raise ValueError("No documents were loaded.")
        new_store = create_and_store_embeddings(_embedding_model, chunk_document(loaded))
        return new_store.as_retriever(search_kwargs=search_options)
106
+
107
def create_rag_chain(embedding_model: JinaEmbeddings, llm: ChatGroq) -> Runnable:
    """Assemble the retrieval chain: vectorstore retriever + stuffed-documents LLM chain."""
    # The prompt receives the retrieved documents as {context} and the user
    # question as {input}.
    prompt = ChatPromptTemplate.from_template(
        """Answer the question based only on the following context.
Think step by step before providing a detailed answer. I will give you
$500 if the user finds the response useful.
<context>
{context}
</context>

Question: {input}
"""
    )
    answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    return create_retrieval_chain(
        get_vectorstore_retriever(embedding_model), answer_chain
    )
127
+
128
def run_chain(chain: Runnable, query: str) -> str:
    """Run the RAG chain with the user query and format the result as text.

    Args:
        chain: Object whose ``invoke`` is called with ``{"input": query}`` and
            whose result provides a "context" sequence of documents and an
            "answer" string.
        query: The user's question.

    Returns:
        One "[+] <metadata> | content: <first 20 characters>..." line per
        context document, followed by a blank line and the answer. If any
        exception is raised, it is reported with st.error and an empty string
        is returned.
    """
    try:
        response = chain.invoke({"input": query})

        # Build the source summary in one pass instead of repeated "+=".
        context_output = "".join(
            f"[+] {doc.metadata} | content: {doc.page_content[:20]}...\n"
            for doc in response["context"]
        )

        return context_output + "\n" + response["answer"]
    except Exception as e:
        st.error(f"[-] Error running the chain: {str(e)}")
        return ""
141
+
142
def main():
    """Render the Streamlit page: set up the models and chain, then answer one prompt.

    Each setup step (embeddings, LLM, RAG chain) reports its failure with
    st.error and stops the run; otherwise a non-empty prompt is sent through
    the chain and the response is written to the page.
    """
    st.title("PDF Chat with RAG Chain")

    # Initialize models
    try:
        embedder = JinaEmbeddings(
            jina_api_key=JINA_API_KEY,
            model_name=EMBED_MODEL_NAME,
        )
    except Exception as exc:
        st.error(f"[-] Failed to initialize JinaEmbeddings: {exc}")
        return

    try:
        groq_llm = ChatGroq(temperature=LLM_TEMPERATURE, model_name=LLM_NAME)
    except Exception as exc:
        st.error(f"[-] Failed to initialize ChatGroq: {exc}")
        return

    # Create RAG chain
    try:
        rag_chain = create_rag_chain(embedding_model=embedder, llm=groq_llm)
    except Exception as exc:
        st.error(f"[-] Failed to create RAG chain: {exc}")
        return

    # User input
    query = st.text_input("Enter a prompt:", "")
    if not query:
        return
    with st.spinner("Processing..."):
        answer = run_chain(rag_chain, query)
        st.write(answer)


if __name__ == "__main__":
    main()
requirements.txt ADDED
Binary file (4.56 kB). View file
 
vectorstore/68314fe0-2cd2-4e99-bc89-3dc28b045b1a/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a13e72541800c513c73dccea69f79e39cf4baef4fa23f7e117c0d6b0f5f99670
3
+ size 3212000
vectorstore/68314fe0-2cd2-4e99-bc89-3dc28b045b1a/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ec6df10978b056a10062ed99efeef2702fa4a1301fad702b53dd2517103c746
3
+ size 100
vectorstore/68314fe0-2cd2-4e99-bc89-3dc28b045b1a/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0eae05c88d9eb862f414814ef2ca3db48409fa5cda11369b4d26166d44f8188
3
+ size 4000
vectorstore/68314fe0-2cd2-4e99-bc89-3dc28b045b1a/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3
+ size 0
vectorstore/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61c46ac2742dce394db1d3c16bef58d90eb39b3dd201f1024aeaa525a4951498
3
+ size 1003520