Spaces:

anilkumar-kanasani
/

chat-with-your-pdf

Runtime error

App Files Files Community

anilkumar-kanasani committed on Sep 22, 2023

Commit

e6f8d33

•

1 Parent(s): 21fce51

Upload 3 files

Browse files

Files changed (3) hide show

app.py +109 -0
requirements.txt +21 -0
utils.py +109 -0

app.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import streamlit as st
+from PyPDF2 import PdfReader
+from langchain.vectorstores import FAISS
+from langchain.chains import LLMChain, ConversationalRetrievalChain
+from utils import (get_hf_embeddings,
+                  get_openAI_chat_model,
+                  get_hf_model,
+                  get_local_gpt4_model,
+                  set_LangChain_tracking,
+                  check_password)
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.memory import ConversationBufferMemory
+from langchain.docstore.document import Document
+embeddings = get_hf_embeddings()  # embeddings client from utils; passed to FAISS.from_documents below
+openai_chat_model = get_openAI_chat_model()  # chat model from utils; used for entity extraction and the retrieval chain
+#local_model = get_local_gpt4_model(model = "GPT4All-13B-snoozy.ggmlv3.q4_0.bin")
+hf_chat_model = get_hf_model(repo_id = "tiiuae/falcon-40b")  # NOTE(review): not referenced elsewhere in app.py; confirm it is still needed
+## Preparing Prompt: template that asks the LLM to list the important entities of the supplied text
+from langchain.prompts import PromptTemplate
+entity_extraction_template = """
+Extract all top 10 important entites from the following context \
+return as python list \
+{input_text} \
+List of entities:"""
+ENTITY_EXTRACTION_PROMPT = PromptTemplate.from_template(entity_extraction_template)  # template placeholder: {input_text}
+def get_qa_prompt(List_of_entities):
+    qa_template = """
+    Use the following pieces of context to answer the question at the end. \
+    Use the following list of entities as your working scope. \
+    If the question is out of given list of entities, just say that your question \
+    is out of scope and give them the list of entities as your working scope \
+    If you dont know the answer, just say that you don't know and tell \
+    the user to seach web for more information, don't try to make up \
+    an answer. Use three sentences maximum and keep the answer as \
+    concise as possible.\
+    list of entities: \
+    """ + str(List_of_entities) + """ \
+    context: {context} \
+    Question: {question} \
+    Helpful Answer:"""
+    print(qa_template)
+    QA_CHAIN_PROMPT = PromptTemplate.from_template(qa_template)
+    return QA_CHAIN_PROMPT
+if check_password():
+    st.title("Chat with your PDF ")
+    st.session_state.file_tracking = "new_run"
+    with st.expander("Upload your PDF : ", expanded=True):
+        st.session_state.lc_tracking = st.text_input("Please give a name to your session?")
+        input_file = st.file_uploader(label = "Upload a file",
+                            accept_multiple_files=False,
+                            type=["pdf"],
+                            )
+        if st.button("Process the file"):
+            st.session_state.file_tracking = "req_to_process"
+            try:
+                set_LangChain_tracking(project=str(st.session_state.lc_tracking))
+            except:
+                set_LangChain_tracking(project="default")
+        if st.session_state.file_tracking == "req_to_process" and input_file is not None:
+            # Load Text Data
+            input_text = ''
+            bytes_data = PdfReader(input_file)
+            for page in bytes_data.pages:
+                input_text += page.extract_text()
+            st.session_state.ner_chain = LLMChain(llm=openai_chat_model, prompt=ENTITY_EXTRACTION_PROMPT)
+            st.session_state.ners = st.session_state.ner_chain.run(input_text=input_text, verbose=True)
+            input_text = input_text.replace('\n', '')
+            text_doc_chunks = [Document(page_content=x, metadata={}) for x  in input_text.split('.')]
+            # Embed and VectorStore
+            vector_store = FAISS.from_documents(text_doc_chunks, embeddings)
+            st.session_state.chat_history = []
+            st.session_state.formatted_prompt = get_qa_prompt(st.session_state.ners)
+            st.session_state.chat_chain = ConversationalRetrievalChain.from_llm(
+                                                                    openai_chat_model,
+                                                                    chain_type="stuff", # "stuff", "map_reduce", "refine", "map_rerank"
+                                                                    verbose=True,
+                                                                    retriever=vector_store.as_retriever(),
+                                                                    # search_type="mmr"
+                                                                    # search_kwargs={"k": 1}
+                                                                    # search_type="similarity_score_threshold", search_kwargs={"score_threshold": .5}
+                                                                    combine_docs_chain_kwargs={"prompt": st.session_state.formatted_prompt},
+                                                                    )
+        if "chat_chain" in st.session_state:
+            st.header("We are ready to start chat with your pdf")
+            st.subheader("The scope of your PDF is: ")
+            st.markdown(st.session_state.ners)
+        else:
+            st.header("Upload and Process your file first")
+    if "chat_chain" in st.session_state and st.session_state.chat_history is not None:
+        if question := st.chat_input("Please type some thing here?"):
+            response = st.session_state.chat_chain({"question": question, "chat_history": st.session_state.chat_history})
+            st.session_state.chat_history.append((question, response["answer"]))
+        # Display chat messages from history on app rerun
+        for message in st.session_state.chat_history:
+            with st.chat_message("user"):
+                st.markdown(message[0])
+            with st.chat_message("assistant"):
+                st.markdown(message[1])

requirements.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+# APIs
+gpt4all
+openai
+huggingface_hub
+# LLM Framework
+langchain
+# Chunking Dependencies
+tiktoken
+transformers
+# Embedding Dependencies
+InstructorEmbedding
+torch
+# Loading Dependencies
+PyPDF2
+pypdf
+# VectorStore Dependencies
+faiss-cpu
+# UI
+streamlit==1.25.0
+watchdog==3.0.0
+environs

utils.py ADDED Viewed

	@@ -0,0 +1,109 @@

+from environs import Env
+env = Env()
+try:
+    env.read_env("/Users/kanasani/Documents/api_keys/.env.llm")
+    print("Using local .env.llm file")
+except:
+    env.read_env()
+    print(".env file from repo secrets is used")
+import openai
+openai.api_type = env("API_TYPE")
+openai.api_base = env("API_BASE")
+openai.api_version = env("API_VERSION")
+openai.api_key = env("AZURE_OPENAI_KEY")
+def check_password():
+    import streamlit as st
+    """Returns `True` if the user had the correct password."""
+    def password_entered():
+        """Checks whether a password entered by the user is correct."""
+        if st.session_state["password"] == env("st_password"):
+            st.session_state["password_correct"] = True
+            del st.session_state["password"]  # don't store password
+        else:
+            st.session_state["password_correct"] = False
+    if "password_correct" not in st.session_state:
+        # First run, show input for password.
+        st.text_input(
+            "Password", type="password", on_change=password_entered, key="password"
+        )
+        return False
+    elif not st.session_state["password_correct"]:
+        # Password not correct, show input + error.
+        st.text_input(
+            "Password", type="password", on_change=password_entered, key="password"
+        )
+        st.error("😕 Password incorrect")
+        return False
+    else:
+        # Password correct.
+        return True
+def submit_prompt_to_gpt(input_list_of_prompts):
+    response = openai.ChatCompletion.create(
+        engine=env("DEPLOYMENT_NAME"),
+        messages=input_list_of_prompts,
+        temperature=1,
+        max_tokens=256,
+        top_p=1,
+        frequency_penalty=0,
+        presence_penalty=0,
+    )
+    response_content = response["choices"][0]["message"]["content"]
+    return response_content
+def get_hf_embeddings():
+    from langchain.embeddings import HuggingFaceHubEmbeddings
+    embeddings = HuggingFaceHubEmbeddings(
+        repo_id="sentence-transformers/all-mpnet-base-v2",
+        task="feature-extraction",
+        huggingfacehub_api_token=env("HUGGINGFACEHUB_API_TOKEN"),
+    )
+    return embeddings
+def get_openAI_chat_model():
+    import openai
+    from langchain.chat_models.azure_openai import AzureChatOpenAI
+    chat_model = AzureChatOpenAI(deployment_name=env("DEPLOYMENT_NAME"),
+                                openai_api_version=env("API_VERSION"),
+                                openai_api_base=env("API_BASE"),
+                                openai_api_type=env("API_TYPE"),
+                                openai_api_key=env("AZURE_OPENAI_KEY"),
+                                verbose=True)
+    return chat_model
+def get_hf_model(repo_id = "google/flan-t5-xxl"):
+    from langchain import HuggingFaceHub
+    hf_llm = HuggingFaceHub(
+        repo_id=repo_id,
+        model_kwargs={"temperature": 0.1, "max_length": 1024},
+        huggingfacehub_api_token = env("HUGGINGFACEHUB_API_TOKEN"),
+    )
+    return hf_llm
+def get_local_gpt4_model(model = "GPT4All-13B-snoozy.ggmlv3.q4_0.bin"):
+    from langchain.llms import GPT4All
+    gpt4_llm = GPT4All(model=".models/"+model,
+                       verbose=True)
+    return gpt4_llm
+def set_LangChain_tracking(project="Chat with your PDF"):
+    import os
+    os.environ['LANGCHAIN_PROJECT'] = project
+    print("LangChain tracking is set to : ", project)
+def unset_LangChain_tracking():
+    import os
+    os.environ.pop('LANGCHAIN_API_KEY', None)
+    os.environ.pop('LANGCHAIN_TRACING_V2', None)
+    os.environ.pop('LANGCHAIN_ENDPOINT', None)
+    os.environ.pop('LANGCHAIN_PROJECT', None)
+    print("LangChain tracking is removed .")