IMvision12 committed
Commit
ba60257
Parent: 7e967db
Files changed (5)
  1. .gitignore +160 -0
  2. app.py +134 -0
  3. data.py +61 -0
  4. model.py +73 -0
  5. requirements.txt +10 -0
.gitignore ADDED
@@ -0,0 +1,160 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
app.py ADDED
@@ -0,0 +1,134 @@
+ import sys
+
+ # Chroma needs a newer sqlite3 than many hosts ship; swap in pysqlite3 before
+ # data.py (and, through it, chromadb) ever imports sqlite3.
+ __import__("pysqlite3")
+ sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
+
+ import os
+
+ import streamlit as st
+ from langchain.chains import RetrievalQA
+
+ from data import create_retriever
+ from model import initialize_llmchain
+
+ st.set_page_config(page_title="🤗Chat💬")
+
+ embed_model_dict = {
+     "MiniLM-L6": "nreimers/MiniLM-L6-H384-uncased",
+     "Mpnet-Base": "sentence-transformers/all-mpnet-base-v2",
+ }
+
+ llm_model_dict = {
+     "Llama-2 7B (Free)": "daryl149/llama-2-7b-chat-hf",
+     "Gemma 7B": "google/gemma-7b",
+     "Gemma 2B": "google/gemma-2b",
+     "Gemma 7B-it": "google/gemma-7b-it",
+     "Gemma 2B-it": "google/gemma-2b-it",
+     "Llama-2 7B Chat HF": "meta-llama/Llama-2-7b-chat-hf",
+     "Llama-2 70B Chat HF": "meta-llama/Llama-2-70b-chat-hf",
+     "Llama-2 13B Chat HF": "meta-llama/Llama-2-13b-chat-hf",
+     "Llama-2 70B": "meta-llama/Llama-2-70b",
+     "Llama-2 13B": "meta-llama/Llama-2-13b",
+     "Llama-2 7B": "meta-llama/Llama-2-7b",
+ }
+
+
+ def save_uploadedfile(uploadedfile):
+     if not os.path.exists("./tempfolder"):
+         os.makedirs("./tempfolder")
+     full_path = os.path.join("tempfolder", uploadedfile.name)
+     with open(full_path, "wb") as f:
+         f.write(uploadedfile.getbuffer())
+     return st.success("Saved File")
+
+
+ with st.sidebar:
+     st.markdown(
+         """
+         <style>
+         section[data-testid="stSidebar"] .css-ng1t4o {width: 100rem;}
+         </style>
+         """,
+         unsafe_allow_html=True,
+     )
+     st.header("Choose and Configure your Embedding Model", divider="rainbow")
+     uploaded_files = st.file_uploader(
+         "Choose a file", type=["pdf"], accept_multiple_files=True
+     )
+     embed_model = embed_model_dict[
+         st.selectbox("Select Embedding Model", ("MiniLM-L6", "Mpnet-Base"))
+     ]
+     for file in uploaded_files:
+         save_uploadedfile(file)
+
+     chunksize = st.slider("Chunk Size", 256, 1024, 400, 10)
+     chunkoverlap = st.slider("Chunk Overlap", 100, 500, 300, 10)
+
+     st.header("Choose and Configure your LLM Model", divider="rainbow")
+     llm_model = llm_model_dict[
+         st.selectbox("Select LLM Model", list(llm_model_dict.keys()))
+     ]
+     access_token = st.text_input("Enter HuggingFace Access Token")
+     temperature = st.slider("Temperature", 0.0, 1.0, 0.7, 0.05)
+     max_tokens = st.slider("Max Tokens", 256, 1024, 400, 10)
+     top_k = st.slider("top_k", 1, 100, 40, 1)
+     quantization_option = st.radio("Quantization Option", ("8Bit Quant", "4Bit Quant"))
+     load_in_4bit = quantization_option == "4Bit Quant"
+     load_in_8bit = not load_in_4bit
+
+     if st.button("Submit"):
+         with st.spinner("Processing PDFs..."):
+             retriever = create_retriever(
+                 pdf_directory="./tempfolder",
+                 chunk_size=chunksize,
+                 chunk_overlap=chunkoverlap,
+                 embedding_model_name=embed_model,
+             )
+         with st.spinner("Loading LLM model..."):
+             llm = initialize_llmchain(
+                 llm_model=llm_model,
+                 temperature=temperature,
+                 max_tokens=max_tokens,
+                 top_k=top_k,
+                 load_in_4bit=load_in_4bit,
+                 load_in_8bit=load_in_8bit,
+                 access_token=access_token,
+             )
+         # NOTE: retriever and llm are not stored in st.session_state, so they
+         # do not survive Streamlit reruns.
+
+ st.title("💬 Chat With PDFs")
+
+ st.markdown("- Choose 🚀 and configure your Embedding Model.")
+ st.markdown("- Choose 🚀 and configure your LLM Model.")
+ st.markdown("- Enter your HuggingFace Token ❗️ (only Llama-2 7B (Free) works without an HF Token).")
+ st.markdown(
+     """
+     <p align="center">It will take some time <b>⏳</b> to download and load the models.</p>
+     <p align="center">Once the download is complete, you can start chatting!</p>
+     """,
+     unsafe_allow_html=True,
+ )
+
+ st.markdown(
+     """
+     <style>
+     [data-testid="stMarkdownContainer"] ul {
+         padding-left: 40px;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )
+
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ for message in st.session_state.messages:
+     with st.chat_message(message["role"]):
+         st.markdown(message["content"])
+
+ if prompt := st.chat_input("What is up?", key="user_input"):
+     st.session_state.messages.append({"role": "user", "content": prompt})
+
+     with st.chat_message("user"):
+         st.markdown(prompt)
+
+     with st.chat_message("assistant"):
+         response = "Hi"  # placeholder reply; the RetrievalQA chain is not wired up yet
+         st.session_state.messages.append({"role": "assistant", "content": response})
+         st.markdown(response)
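
app.py imports RetrievalQA but leaves the assistant reply hard-coded to "Hi". A minimal sketch of the intended wiring, assuming the standard RetrievalQA.from_chain_type API and a hypothetical st.session_state key that this commit does not yet create:

# Hypothetical wiring sketch, not part of this commit.
# In the Submit handler, after retriever and llm are built:
st.session_state.qa_chain = RetrievalQA.from_chain_type(
    llm=llm,              # HuggingFacePipeline from initialize_llmchain(...)
    chain_type="stuff",   # stuff retrieved chunks directly into the prompt
    retriever=retriever,  # Chroma retriever from create_retriever(...)
)

# In the chat handler, replacing the fixed response:
response = st.session_state.qa_chain.run(prompt)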
data.py ADDED
@@ -0,0 +1,61 @@
+ import warnings
+ from typing import Dict, Optional
+
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import Chroma
+
+ warnings.simplefilter("ignore")
+
+
+ def create_retriever(
+     pdf_directory: str,
+     chunk_size: int = 1000,
+     chunk_overlap: int = 100,
+     embedding_model_name: str = "sentence-transformers/all-mpnet-base-v2",
+     model_kwargs: Optional[Dict[str, str]] = None,
+ ):
+     """
+     Creates and returns a retriever object based on the provided PDF directory and configuration.
+
+     Args:
+     - pdf_directory (str): Path to the directory containing PDF files.
+     - chunk_size (int): Size of each chunk for splitting documents.
+     - chunk_overlap (int): Overlap size between adjacent chunks.
+     - embedding_model_name (str): Name of the HuggingFace embedding model to use.
+     - model_kwargs (dict, optional): Keyword arguments for the embedding model;
+       defaults to {"device": "cpu"}.
+
+     Returns:
+     - retriever (Retriever): Retriever object for retrieving documents.
+
+     Raises:
+     - ValueError: If input values are invalid.
+     """
+     if model_kwargs is None:
+         # Avoid a mutable default argument; embed on CPU unless told otherwise.
+         model_kwargs = {"device": "cpu"}
+     if chunk_size <= 0:
+         raise ValueError("Chunk size must be a positive integer.")
+     if chunk_overlap < 0 or chunk_overlap >= chunk_size:
+         raise ValueError(
+             "Chunk overlap must be a non-negative integer less than the chunk size."
+         )
+
+     # Load documents
+     loader = PyPDFDirectoryLoader(pdf_directory)
+     documents = loader.load()
+
+     # Split documents into small chunks
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size, chunk_overlap=chunk_overlap
+     )
+     all_splits = text_splitter.split_documents(documents)
+
+     # Specify embedding model
+     embeddings = HuggingFaceEmbeddings(
+         model_name=embedding_model_name, model_kwargs=model_kwargs
+     )
+
+     # Embed document chunks into a persistent Chroma store
+     vectordb = Chroma.from_documents(
+         documents=all_splits, embedding=embeddings, persist_directory="chroma_db"
+     )
+
+     # Create and return retriever
+     retriever = vectordb.as_retriever()
+     return retriever
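
A quick usage sketch of create_retriever (hypothetical values, not part of the commit), showing how app.py's sidebar settings map onto the function and how retrieved chunks come back:

# Index every PDF in ./tempfolder, then fetch the chunks nearest a query.
retriever = create_retriever(
    pdf_directory="./tempfolder",
    chunk_size=400,     # matches the app's default slider value
    chunk_overlap=100,
)
docs = retriever.get_relevant_documents("What is this document about?")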
model.py ADDED
@@ -0,0 +1,73 @@
+ import torch
+ from langchain_community.llms import HuggingFacePipeline
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     BitsAndBytesConfig,
+     pipeline,
+ )
+
+
+ def initialize_llmchain(
+     llm_model: str,
+     temperature: float,
+     max_tokens: int,
+     top_k: int,
+     access_token: str = None,
+     torch_dtype: str = "auto",
+     load_in_8bit: bool = False,
+     load_in_4bit: bool = False,
+ ) -> HuggingFacePipeline:
+     """
+     Initializes a language model pipeline based on the provided parameters.
+
+     Args:
+     - llm_model (str): The name of the language model to initialize.
+     - temperature (float): The temperature parameter for text generation.
+     - max_tokens (int): The maximum number of new tokens to generate.
+     - top_k (int): The top-k parameter for token selection during generation.
+     - access_token (str, optional): HuggingFace token for gated models such as Llama-2.
+     - torch_dtype (str): The torch dtype to use for model inference (default is "auto").
+     - load_in_8bit (bool): Whether to load the model in 8-bit format (default is False).
+     - load_in_4bit (bool): Whether to load the model in 4-bit format (default is False).
+
+     Returns:
+     - HuggingFacePipeline: Initialized language model pipeline.
+     """
+     if load_in_8bit:
+         bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+     elif load_in_4bit:
+         bnb_config = BitsAndBytesConfig(
+             load_in_8bit=False,
+             load_in_4bit=True,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_use_double_quant=True,
+             bnb_4bit_compute_dtype=torch.bfloat16,
+         )
+     else:
+         bnb_config = None
+
+     # Initialize model and tokenizer; the token belongs here (not on the
+     # pipeline call) so that gated checkpoints can actually be downloaded.
+     model = AutoModelForCausalLM.from_pretrained(
+         llm_model,
+         low_cpu_mem_usage=True,
+         quantization_config=bnb_config,
+         torch_dtype=torch_dtype,
+         token=access_token,
+     )
+     tokenizer = AutoTokenizer.from_pretrained(llm_model, token=access_token)
+
+     # Initialize pipeline; generation settings are passed directly so they are
+     # forwarded to generate() (model_kwargs would only go to from_pretrained).
+     pipe = pipeline(
+         task="text-generation",
+         model=model,
+         tokenizer=tokenizer,
+         temperature=temperature,
+         max_new_tokens=max_tokens,
+         top_k=top_k,
+         do_sample=True,
+         pad_token_id=tokenizer.eos_token_id,
+     )
+
+     llm = HuggingFacePipeline(pipeline=pipe)
+     return llm
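
A quick usage sketch of initialize_llmchain (hypothetical values, not part of the commit); gated meta-llama checkpoints would additionally need access_token:

# Load the ungated Llama-2 chat mirror in 4-bit and generate once.
llm = initialize_llmchain(
    llm_model="daryl149/llama-2-7b-chat-hf",
    temperature=0.7,
    max_tokens=512,
    top_k=40,
    load_in_4bit=True,
)
print(llm.invoke("Summarize retrieval-augmented generation in one sentence."))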
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ pypdf
+ langchain
+ sentence-transformers
+ peft
+ chromadb
+ accelerate==0.28.0
+ bitsandbytes==0.43.0
+ streamlit
+ streamlit-chat
+ pysqlite3-binary