SBairagi committed on
Commit
ce02fbb
1 Parent(s): 2d8707e

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ healthy-recipes.pdf filter=lfs diff=lfs merge=lfs -text
Mind is your Business.pdf ADDED
Binary file (766 kB). View file
 
Mind is your Business.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3
+ size 0
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Import Libraries
2
+
3
+ import streamlit as st
4
+ from dotenv import load_dotenv
5
+ import pickle
6
+ from PyPDF2 import PdfReader
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain.embeddings.openai import OpenAIEmbeddings
9
+ from langchain.vectorstores import FAISS
10
+ from langchain.llms import OpenAI
11
+ from langchain.chains.question_answering import load_qa_chain
12
+ from langchain.callbacks import get_openai_callback
13
+ import os
14
+ load_dotenv()
15
+
16
+ ## Reading the PDF
17
+
18
+ st.header("Chat with your PDF 💬")
19
+
20
+ pdf = st.file_uploader("Upload your PDF", type='pdf') # upload a PDF file
21
+ if pdf is not None:
22
+ pdf_reader = PdfReader(pdf) # read the pdf file
23
+
24
+ text = "" # collect all text data in this variable
25
+ for page in pdf_reader.pages:
26
+ text += page.extract_text()
27
+
28
+ #st.write(text)
29
+
30
+ ## Forming chunks of data
31
+
32
+ text_splitter = RecursiveCharacterTextSplitter(
33
+ chunk_size=1000, # at most 1000 characters per chunk (length_function=len counts characters)
34
+ chunk_overlap=200, # consecutive chunks overlap by up to 200 characters
35
+ length_function=len
36
+ )
37
+
38
+ chunks = text_splitter.split_text(text=text) # forming and collecting chunks here
39
+ # st.write(chunks)
40
+
41
+ ## Create Embeddings of each chunk of data and store them in the Vector DB
42
+
43
+ store_name = pdf.name[:-4] # uploaded file name with its last four characters (the ".pdf" extension) removed
44
+ embeddings = OpenAIEmbeddings(openai_api_key = os.environ["OpenAI_API_KEY"]) # using OpenAI to create embeddings
45
+
46
+ if os.path.exists(f"{store_name}"): # if the vector DB already exists on disk, load it
47
+ #path = f"{store_name}\index.pkl"
48
+ VectorStore = FAISS.load_local(f"{store_name}",embeddings,allow_dangerous_deserialization=True)
49
+
50
+ st.write('Vector Database already exists.')
51
+
52
+ else:
53
+ VectorStore = FAISS.from_texts(chunks, embedding=embeddings) # providing the input chunks to create embeddings
54
+
55
+ VectorStore.save_local(f"{store_name}")
56
+ st.write('Creating new embeddings.')
57
+
58
+ ## Accepting query from user
59
+
60
+ query = st.text_input("Ask questions about your PDF file:")
61
+ #st.write(query)
62
+
63
+ if query:
64
+ docs = VectorStore.similarity_search(query=query, k=3)
65
+
66
+ llm = OpenAI()
67
+ chain = load_qa_chain(llm=llm, chain_type="stuff")
68
+ with get_openai_callback() as cb:
69
+ response = chain.run(input_documents=docs, question=query)
70
+ print(cb)
71
+ st.success(response)
healthy-recipes.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a506537e14017aef4761e84ceb212f707484170ae7c493b9d7431136a62f83a
3
+ size 3690108
notebook.ipynb ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ PyPDF2
3
+ python-dotenv
4
+ streamlit
5
+ faiss-cpu
6
+ streamlit-extras
7
+ openai
8
+ tiktoken