Spaces:

SBairagi
/

PDF_Query_Langchain

Sleeping

App Files Files Community

SBairagi committed on Mar 26

Commit

ce02fbb

•

1 Parent(s): 2d8707e

Upload 6 files

Browse files

Files changed (7) hide show

.gitattributes +1 -0
Mind is your Business.pdf +0 -0
Mind is your Business.pkl +3 -0
app.py +71 -0
healthy-recipes.pdf +3 -0
notebook.ipynb +0 -0
requirements.txt +8 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+healthy-recipes.pdf filter=lfs diff=lfs merge=lfs -text

Mind is your Business.pdf ADDED Viewed

Binary file (766 kB). View file

Mind is your Business.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+size 0

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+## Import Libraries
+import streamlit as st
+from dotenv import load_dotenv
+import pickle
+from PyPDF2 import PdfReader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.llms import OpenAI
+from langchain.chains.question_answering import load_qa_chain
+from langchain.callbacks import get_openai_callback
+import os
+load_dotenv()
+## Reading the PDF
+st.header("Chat with your PDF 💬")
+pdf = st.file_uploader("Upload your PDF", type='pdf') # upload a PDF file
+if pdf is not None:
+    pdf_reader = PdfReader(pdf) # read the pdf file
+    text = "" # collect all text data in this variable
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+    #st.write(text)
+ ## Forming chunks of data
+    text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000, # 1000 tokens in each chunk
+            chunk_overlap=200, # 2oo tokens will have overlap in consecutive chunks
+            length_function=len
+            )
+    chunks = text_splitter.split_text(text=text) # forming and collecting chunks here
+    # st.write(chunks)
+## Create Embeddings of each chunk of data and store them in the Vector DB
+    store_name = pdf.name[:-4] # extract the pdf name
+    embeddings = OpenAIEmbeddings(openai_api_key = os.environ["OpenAI_API_KEY"]) # using OpenAI to create embeddings
+    if os.path.exists(f"{store_name}"): # if already the vector db is present then load it
+        #path = f"{store_name}\index.pkl"
+        VectorStore = FAISS.load_local(f"{store_name}",embeddings,allow_dangerous_deserialization=True)
+        st.write('Vector Database already exists.')
+    else:
+        VectorStore = FAISS.from_texts(chunks, embedding=embeddings) # providing the input chunks to create embeddings
+        VectorStore.save_local(f"{store_name}")
+        st.write('Creating new embeddings.')
+## Accepting query from user
+    query = st.text_input("Ask questions about your PDF file:")
+    #st.write(query)
+    if query:
+        docs = VectorStore.similarity_search(query=query, k=3)
+        llm = OpenAI()
+        chain = load_qa_chain(llm=llm, chain_type="stuff")
+        with get_openai_callback() as cb:
+            response = chain.run(input_documents=docs, question=query)
+            print(cb)
+        st.success(response)

healthy-recipes.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a506537e14017aef4761e84ceb212f707484170ae7c493b9d7431136a62f83a
+size 3690108

notebook.ipynb ADDED Viewed

File without changes

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+langchain
+PyPDF2
+python-dotenv
+streamlit
+faiss-cpu
+streamlit-extras
+openai
+tiktoken