NickNYU committed on
Commit
bb37df0
1 Parent(s): e013e04

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -54
app.py CHANGED
@@ -3,6 +3,7 @@ import streamlit as st
3
  import os
4
  import pickle
5
  from PyPDF2 import PdfReader
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain.embeddings.openai import OpenAIEmbeddings
8
  from langchain.vectorstores import FAISS
@@ -29,65 +30,42 @@ with st.sidebar:
29
  def main():
30
  st.header("Chat with PDF 💬")
31
 
32
- # upload a PDF file
33
- pdf = st.file_uploader("Upload your PDF", type='pdf')
34
-
35
- if pdf is not None:
36
- pdf_reader = PdfReader(pdf)
37
-
38
- text = ""
39
- for page in pdf_reader.pages:
40
- text += page.extract_text()
41
-
 
42
  text_splitter = RecursiveCharacterTextSplitter(
43
  chunk_size=512,
44
  chunk_overlap=128,
45
  length_function=len
46
  )
47
- chunks = text_splitter.split_text(text=text)
48
-
49
- # # embeddings
50
- store_name = pdf.name[:-4]
51
- st.write(f'{store_name}')
52
-
53
- if os.path.exists(f"{store_name}.pkl"):
54
- with open(f"{store_name}.pkl", "rb") as f:
55
- VectorStore = pickle.load(f)
56
- st.write('Embeddings Loaded from the Disk')
57
- else:
58
- st.write('Embeddings calculate to the Pinecone')
59
- embeddings = OpenAIEmbeddings()
60
- VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
61
- print(VectorStore)
62
- with open(f"{store_name}.pkl", "wb") as f:
63
- pickle.dump(VectorStore, f)
64
-
65
-
66
- # PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '894d5f1f-df46-4b01-8407-d9977eaee2eb')
67
- # PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV',
68
- # 'asia-southeast1-gcp-free') # You may need to switch with your env
69
- # embeddings = OpenAIEmbeddings()
70
- # # initialize pinecone
71
- # pinecone.init(
72
- # api_key=PINECONE_API_KEY, # find at app.pinecone.io
73
- # environment=PINECONE_API_ENV # next to api key in console
74
- # )
75
- # index_name = "indexer" # put in the name of your pinecone index here
76
- # VectorStore = Pinecone.from_texts(chunks, embeddings, index_name=index_name)
77
-
78
- # Accept user questions/query
79
- query = st.text_input("Ask questions about your PDF file:")
80
- # st.write(query)
81
-
82
- if query:
83
- docs = VectorStore.similarity_search(query=query, k=3)
84
-
85
- llm = OpenAI()
86
- chain = load_qa_chain(llm=llm, chain_type="stuff")
87
- with get_openai_callback() as cb:
88
- response = chain.run(input_documents=docs, question=query)
89
- print(cb)
90
- st.write(response)
91
 
92
 
93
  if __name__ == '__main__':
 
3
  import os
4
  import pickle
5
  from PyPDF2 import PdfReader
6
+ from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain.embeddings.openai import OpenAIEmbeddings
9
  from langchain.vectorstores import FAISS
 
30
  def main():
31
  st.header("Chat with PDF 💬")
32
 
33
+ # # embeddings
34
+ store_name = "coffee"
35
+
36
+ if os.path.exists(f"{store_name}.pkl"):
37
+ with open(f"{store_name}.pkl", "rb") as f:
38
+ VectorStore = pickle.load(f)
39
+ st.write('Embeddings Loaded from the Disk')
40
+ else:
41
+ st.write('Reading from prompt ...')
42
+ loader = PyPDFLoader("./咖啡语料.pdf")
43
+ data = loader.load()
44
  text_splitter = RecursiveCharacterTextSplitter(
45
  chunk_size=512,
46
  chunk_overlap=128,
47
  length_function=len
48
  )
49
+ texts = text_splitter.split_documents(data)
50
+ embeddings = OpenAIEmbeddings()
51
+ VectorStore = FAISS.from_texts([t.page_content for t in texts], embedding=embeddings)
52
+ with open(f"{store_name}.pkl", "wb") as f:
53
+ pickle.dump(VectorStore, f)
54
+
55
+
56
+
57
+ query = st.text_input("Ask questions about Starbucks coffee:")
58
+
59
+
60
+ if query:
61
+ docs = VectorStore.similarity_search(query=query, k=3)
62
+
63
+ llm = OpenAI()
64
+ chain = load_qa_chain(llm=llm, chain_type="stuff")
65
+ with get_openai_callback() as cb:
66
+ response = chain.run(input_documents=docs, question=query)
67
+ print(cb)
68
+ st.write(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
 
71
  if __name__ == '__main__':