Spaces:

ahmedelbeshry
/

chatwithpdflangchain

Sleeping

App Files Files Community

ahmedelbeshry committed on Jun 11

Commit

86f3482

•

1 Parent(s): 2b05c60

Upload 11 files

Browse files

Files changed (12) hide show

.gitattributes +2 -0
.gitignore +168 -0
LICENSE +21 -0
README.md +53 -13
The-Finance-Act--2023.pdf +3 -0
app.py +176 -0
requirements.txt +0 -0
vectorstore/68314fe0-2cd2-4e99-bc89-3dc28b045b1a/data_level0.bin +3 -0
vectorstore/68314fe0-2cd2-4e99-bc89-3dc28b045b1a/header.bin +3 -0
vectorstore/68314fe0-2cd2-4e99-bc89-3dc28b045b1a/length.bin +3 -0
vectorstore/68314fe0-2cd2-4e99-bc89-3dc28b045b1a/link_lists.bin +3 -0
vectorstore/chroma.sqlite3 +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+The-Finance-Act--2023.pdf filter=lfs diff=lfs merge=lfs -text
+vectorstore/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,168 @@

+# Local storage
+documents/
+vectorstore/
+hf_model/
+#python env
+myvenv/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Mojtaba Fayazi
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,13 +1,53 @@
----
-title: Chatwithpdflangchain
-emoji: 🔥
-colorFrom: red
-colorTo: purple
-sdk: streamlit
-sdk_version: 1.35.0
-app_file: app.py
-pinned: false
-license: mit
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Chat With PDFs
+Chat with your PDF files for free, using [Langchain](https://python.langchain.com/docs/get_started/quickstart), [Groq](https://console.groq.com/), [Chroma](https://docs.trychroma.com/getting-started) vector store, and [Jina AI](https://jina.ai/embeddings/) embeddings. This repository contains a simple Python implementation of a RAG (Retrieval-Augmented Generation) system. The RAG pipeline retrieves the chunks of the user's PDF file that are relevant to each query and uses them to provide informative responses.
+## Installation
+Follow these steps:
+1. Clone the repository
+   ```
+   git clone https://github.com/S4mpl3r/chat-with-pdf.git
+   ```
+2. Create a virtual environment and activate it (optional, but highly recommended).
+   ```
+   python -m venv .venv
+   Windows: .venv\Scripts\activate
+   Linux: source .venv/bin/activate
+   ```
+3. Install required packages:
+   ```
+   python -m pip install -r requirements.txt
+   ```
+4. Create a .env file in the root of the project and populate it with the following keys. You'll need to obtain your own API keys:
+   ```
+   JINA_API_KEY=<YOUR KEY>
+   GROQ_API_KEY=<YOUR KEY>
+   HF_TOKEN=<YOUR TOKEN>
+   HF_HOME=<PATH TO STORE HUGGINGFACE MODEL>
+   ```
+5. Run the program:
+   ```
+   streamlit run app.py
+   ```
+## Configuration
+You can customize the behavior of the system by modifying the constants and parameters in the app.py file:
+* EMBED_MODEL_NAME: Specify the name of the Jina embedding model to be used.
+* LLM_NAME: Specify the name of the language model (Refer to [Groq](https://groq.com/) for the list of available models).
+* LLM_TEMPERATURE: Set the temperature parameter for the language model.
+* CHUNK_SIZE: Specify the maximum chunk size allowed by the embedding model.
+* DOCUMENT_DIR: Specify the directory where PDF documents are stored.
+* VECTOR_STORE_DIR: Specify the directory where vector embeddings are stored.
+* COLLECTION_NAME: Specify the name of the collection for the chroma vector store.
+## Resources
+Kudos to the amazing libraries and services listed below:
+* [Langchain](https://www.langchain.com/)
+* [Groq](https://groq.com/)
+* [Jina AI](https://jina.ai/)
+* [ChromaDB](https://www.trychroma.com/)
+## License
+MIT

The-Finance-Act--2023.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dd1abaa7b10154618f866c3399c9dc9655c0b2f3ff45ccbeb57b2590c0b904b7
+size 3668168

app.py ADDED Viewed

	@@ -0,0 +1,176 @@

+import os
+from typing import List
+import streamlit as st
+import chromadb
+from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
+from langchain.chains.retrieval import create_retrieval_chain
+from langchain.docstore.document import Document
+from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
+from langchain_community.embeddings import JinaEmbeddings
+from langchain_community.vectorstores.chroma import Chroma
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import Runnable
+from langchain_core.vectorstores import VectorStoreRetriever
+from langchain_groq import ChatGroq
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from transformers import BertTokenizer
+# CONSTANTS =====================================================
+EMBED_MODEL_NAME = "jina-embeddings-v2-base-en"
+LLM_NAME = "mixtral-8x7b-32768"
+LLM_TEMPERATURE = 0.1
+# This is the maximum chunk size allowed by the chosen embedding model. You can choose a smaller size.
+CHUNK_SIZE = 8192
+DOCUMENT_DIR = "E:\\test\\chat-with-pdf"  # The directory where the PDF files should be placed
+VECTOR_STORE_DIR = "./vectorstore/"  # The directory where the vectors are stored
+COLLECTION_NAME = "collection1"  # ChromaDB collection name
+# ===============================================================
+# Define your Jina API key directly in the script
+JINA_API_KEY = 'jina_268f16cdd7f6410c850adbe32de20171ha3URkzQHnwlpDmy8-yhBXACVzXV'
+@st.cache_data
+def load_documents() -> List[Document]:
+    """Loads the PDF files within the DOCUMENT_DIR constant."""
+    try:
+        st.write("[+] Loading documents...")
+        documents = DirectoryLoader(
+            os.path.join(DOCUMENT_DIR), glob="**/*.pdf", loader_cls=PyPDFLoader
+        ).load()
+        st.success(f"[+] Document loaded, total pages: {len(documents)}")
+        return documents
+    except Exception as e:
+        st.error(f"[-] Error loading the document: {str(e)}")
+        return []
+@st.cache_data
+def chunk_document(_documents: List[Document]) -> List[Document]:
+    """Splits the input documents into maximum of CHUNK_SIZE chunks."""
+    tokenizer = BertTokenizer.from_pretrained(
+        "bert-base-uncased", cache_dir=os.environ.get("HF_HOME")
+    )
+    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+        tokenizer=tokenizer,
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=CHUNK_SIZE // 50,
+    )
+    st.write(f"[+] Splitting documents...")
+    chunks = text_splitter.split_documents(_documents)
+    st.success(f"[+] Document splitting done, {len(chunks)} chunks total.")
+    return chunks
+@st.cache_resource
+def create_and_store_embeddings(_embedding_model, _chunks: List[Document]) -> Chroma:
+    """Calculates the embeddings and stores them in a chroma vectorstore."""
+    try:
+        vectorstore = Chroma.from_documents(
+            _chunks,
+            embedding=_embedding_model,
+            collection_name=COLLECTION_NAME,
+            persist_directory=VECTOR_STORE_DIR,
+        )
+        st.success("[+] Vectorstore created.")
+        return vectorstore
+    except Exception as e:
+        st.error(f"[-] Error creating and storing embeddings: {str(e)}")
+        raise
+@st.cache_resource
+def get_vectorstore_retriever(_embedding_model) -> VectorStoreRetriever:
+    """Returns the vectorstore."""
+    db = chromadb.PersistentClient(VECTOR_STORE_DIR)
+    try:
+        # Check for the existence of the vectorstore specified by the COLLECTION_NAME
+        db.get_collection(COLLECTION_NAME)
+        retriever = Chroma(
+            embedding_function=_embedding_model,
+            collection_name=COLLECTION_NAME,
+            persist_directory=VECTOR_STORE_DIR,
+        ).as_retriever(search_kwargs={"k": 3})
+    except ValueError:
+        # The vectorstore doesn't exist, so create it.
+        pdf = load_documents()
+        if not pdf:
+            raise ValueError("No documents were loaded.")
+        chunks = chunk_document(pdf)
+        retriever = create_and_store_embeddings(_embedding_model, chunks).as_retriever(
+            search_kwargs={"k": 3}
+        )
+    return retriever
+def create_rag_chain(embedding_model: JinaEmbeddings, llm: ChatGroq) -> Runnable:
+    """Creates the RAG chain."""
+    template = """Answer the question based only on the following context.
+    Think step by step before providing a detailed answer. I will give you
+    $500 if the user finds the response useful.
+    <context>
+    {context}
+    </context>
+    Question: {input}
+    """
+    prompt = ChatPromptTemplate.from_template(template)
+    document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
+    retriever = get_vectorstore_retriever(embedding_model)
+    retrieval_chain = create_retrieval_chain(retriever, document_chain)
+    return retrieval_chain
+def run_chain(chain: Runnable, query: str) -> str:
+    """Run the RAG chain with the user query."""
+    try:
+        response = chain.invoke({"input": query})
+        context_output = ""
+        for doc in response["context"]:
+            context_output += f"[+] {doc.metadata} | content: {doc.page_content[:20]}...\n"
+        return context_output + "\n" + response["answer"]
+    except Exception as e:
+        st.error(f"[-] Error running the chain: {str(e)}")
+        return ""
+def main():
+    st.title("PDF Chat with RAG Chain")
+    # Initialize models
+    try:
+        embedding_model = JinaEmbeddings(
+            jina_api_key=JINA_API_KEY,
+            model_name=EMBED_MODEL_NAME,
+        )
+    except Exception as e:
+        st.error(f"[-] Failed to initialize JinaEmbeddings: {str(e)}")
+        return
+    try:
+        llm = ChatGroq(temperature=LLM_TEMPERATURE, model_name=LLM_NAME)
+    except Exception as e:
+        st.error(f"[-] Failed to initialize ChatGroq: {str(e)}")
+        return
+    # Create RAG chain
+    try:
+        chain = create_rag_chain(embedding_model=embedding_model, llm=llm)
+    except Exception as e:
+        st.error(f"[-] Failed to create RAG chain: {str(e)}")
+        return
+    # User input
+    query = st.text_input("Enter a prompt:", "")
+    if query:
+        with st.spinner("Processing..."):
+            response = run_chain(chain, query)
+            st.write(response)
+# Run the app only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

Binary file (4.56 kB). View file

vectorstore/68314fe0-2cd2-4e99-bc89-3dc28b045b1a/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a13e72541800c513c73dccea69f79e39cf4baef4fa23f7e117c0d6b0f5f99670
+size 3212000

vectorstore/68314fe0-2cd2-4e99-bc89-3dc28b045b1a/header.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ec6df10978b056a10062ed99efeef2702fa4a1301fad702b53dd2517103c746
+size 100

vectorstore/68314fe0-2cd2-4e99-bc89-3dc28b045b1a/length.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a0eae05c88d9eb862f414814ef2ca3db48409fa5cda11369b4d26166d44f8188
+size 4000

vectorstore/68314fe0-2cd2-4e99-bc89-3dc28b045b1a/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+size 0

vectorstore/chroma.sqlite3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61c46ac2742dce394db1d3c16bef58d90eb39b3dd201f1024aeaa525a4951498
+size 1003520