ryanrwatkins committed on
Commit
5320c7c
1 Parent(s): 5cc5022

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -16
app.py CHANGED
@@ -7,6 +7,15 @@ import langchain
7
  import chromadb
8
  import glob
9
 
 
 
 
 
 
 
 
 
 
10
 
11
  from langchain.embeddings.openai import OpenAIEmbeddings
12
  from langchain.vectorstores import Chroma
@@ -28,6 +37,9 @@ from langchain.chains.question_answering import load_qa_chain
28
 
29
 
30
 
 
 
 
31
def get_empty_state():
    """Build and return a brand-new conversation state.

    The state tracks the running token total and the chat message
    history. A fresh dict (holding a fresh list) is created on every
    call, so callers never share mutable state between sessions.
    """
    state = dict(total_tokens=0, messages=[])
    return state
33
 
@@ -69,22 +81,51 @@ def submit_message(prompt, prompt_template, temperature, max_tokens, context_len
69
  os.environ["OPENAI_API_KEY"] = os.environ['openai_key']
70
 
71
  # load in all the files
72
- path = './files'
 
73
  #pdf_files = glob.glob(os.path.join(path, "*.pdf"))
74
- pdf_files = glob.glob(os.path.join(path, "*.pdf"))
75
 
76
- for file in pdf_files:
77
- loader = PyPDFLoader(file)
78
- pages = loader.load_and_split()
79
- text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=0)
80
- split_pages = text_splitter.split_documents(pages)
81
 
82
- persist_directory = "./embeddings"
83
- embeddings = OpenAIEmbeddings()
84
- vectordb = Chroma.from_documents(split_pages, embeddings, persist_directory=persist_directory)
85
- vectordb.persist()
86
-
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
 
90
  history = state['messages']
@@ -111,16 +152,27 @@ def submit_message(prompt, prompt_template, temperature, max_tokens, context_len
111
  #query = str(system_prompt + history[-context_length*2:] + [prompt_msg])
112
  #completion = completion.run(query)
113
  # from https://blog.devgenius.io/chat-with-document-s-using-openai-chatgpt-api-and-text-embedding-6a0ce3dc8bc8
114
- completion_chain = load_qa_chain(ChatOpenAI(temperature=temperature, max_tokens=max_tokens, model_name="gpt-3.5-turbo"), chain_type="stuff" )
115
- completion = RetrievalQA(combine_documents_chain=completion_chain, retriever=vectordb.as_retriever(), return_source_documents=False)
116
  #completion = RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=temperature, max_tokens=max_tokens, model_name="gpt-3.5-turbo"), chain_type="stuff", retriever=vectordb.as_retriever(), return_source_documents=True)
117
- query = str(system_prompt + history[-context_length*2:] + [prompt_msg])
118
  #completion = completion({"query": query})
119
- completion = completion.run(query)
120
 
121
  # completion = completion({"question": query, "chat_history": history[-context_length*2:]})
122
 
 
 
 
 
 
 
 
123
 
 
 
 
 
124
  # VectorDBQA.from_chain_type(llm=OpenAI(), chain_type="stuff", vectorstore=docsearch, return_source_documents=True)
125
  # https://colab.research.google.com/drive/1dzdNDZyofRB0f2KIB4gHXmIza7ehMX30?usp=sharing#scrollTo=b-ejDn_JfpWW
126
 
 
7
  import chromadb
8
  import glob
9
 
10
+ import pickle
11
+
12
+ from PyPDF2 import PdfReader
13
+ from PyPDF2 import PdfWriter
14
+ from langchain.embeddings.openai import OpenAIEmbeddings
15
+ from langchain.text_splitter import CharacterTextSplitter
16
+ from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS
17
+ from langchain.chains.question_answering import load_qa_chain
18
+ from langchain.llms import OpenAI
19
 
20
  from langchain.embeddings.openai import OpenAIEmbeddings
21
  from langchain.vectorstores import Chroma
 
37
 
38
 
39
 
40
+
41
+
42
+
43
def get_empty_state():
    """Create the initial app state: zero tokens used and an empty history."""
    empty = {}
    empty["total_tokens"] = 0
    empty["messages"] = []
    return empty
45
 
 
81
  os.environ["OPENAI_API_KEY"] = os.environ['openai_key']
82
 
83
  # load in all the files
84
+ #path = './files'
85
+ #pdf_files = glob.glob(os.path.join(path, "*.pdf"))
86
  #pdf_files = glob.glob(os.path.join(path, "*.pdf"))
 
87
 
88
+ #for file in pdf_files:
89
+ # loader = PyPDFLoader(file)
90
+ # pages = loader.load_and_split()
91
+ # text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=0)
92
+ # split_pages = text_splitter.split_documents(pages)
93
 
94
+ #persist_directory = "./embeddings"
95
+ #embeddings = OpenAIEmbeddings()
96
+ #vectordb = Chroma.from_documents(split_pages, embeddings, persist_directory=persist_directory)
97
+ #vectordb.persist()
 
98
 
99
+ path = './files'
100
+ pdf_files = glob.glob(os.path.join(path, "*.pdf"))
101
+
102
+ merger = PdfWriter()
103
+
104
+ # add all files in the list to the merger object
105
+ for pdf in pdf_files:
106
+ merger.append(pdf)
107
+ merger.write("merged-pdf.pdf")
108
+ merger.close()
109
+
110
+ reader = PdfReader("merged-pdf.pdf")
111
+ raw_text = ''
112
+ for i, page in enumerate(reader.pages):
113
+ text = page.extract_text()
114
+ if text:
115
+ raw_text += text
116
+ text_splitter = CharacterTextSplitter(
117
+ separator = "\n",
118
+ chunk_size = 1000,
119
+ chunk_overlap = 200,
120
+ length_function = len,
121
+ )
122
+ texts = text_splitter.split_text(raw_text)
123
+ len(texts)
124
+ embeddings = OpenAIEmbeddings()
125
+
126
+
127
+ with open("foo.pkl", 'wb') as f:
128
+ pickle.dump(embeddings, f)
129
 
130
 
131
  history = state['messages']
 
152
  #query = str(system_prompt + history[-context_length*2:] + [prompt_msg])
153
  #completion = completion.run(query)
154
  # from https://blog.devgenius.io/chat-with-document-s-using-openai-chatgpt-api-and-text-embedding-6a0ce3dc8bc8
155
+ #completion_chain = load_qa_chain(ChatOpenAI(temperature=temperature, max_tokens=max_tokens, model_name="gpt-3.5-turbo"), chain_type="stuff" )
156
+ #completion = RetrievalQA(combine_documents_chain=completion_chain, retriever=vectordb.as_retriever(), return_source_documents=False)
157
  #completion = RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=temperature, max_tokens=max_tokens, model_name="gpt-3.5-turbo"), chain_type="stuff", retriever=vectordb.as_retriever(), return_source_documents=True)
158
+ #query = str(system_prompt + history[-context_length*2:] + [prompt_msg])
159
  #completion = completion({"query": query})
160
+ #completion = completion.run(query)
161
 
162
  # completion = completion({"question": query, "chat_history": history[-context_length*2:]})
163
 
164
+ with open("foo.pkl", 'rb') as f:
165
+ new_docsearch = pickle.load(f)
166
+
167
+ docsearch = FAISS.from_texts(texts, new_docsearch)
168
+ query = str(system_prompt + history[-context_length*2:] + [prompt_msg])
169
+ docs = docsearch.similarity_search(query)
170
+ #print(docs[0].page_content)
171
 
172
+ chain = load_qa_chain(ChatOpenAI(temperature=temperature, max_tokens=max_tokens, model_name="gpt-3.5-turbo"), chain_type="stuff")
173
+ completion = chain.run(input_documents=docs, question=query)
174
+
175
+
176
  # VectorDBQA.from_chain_type(llm=OpenAI(), chain_type="stuff", vectorstore=docsearch, return_source_documents=True)
177
  # https://colab.research.google.com/drive/1dzdNDZyofRB0f2KIB4gHXmIza7ehMX30?usp=sharing#scrollTo=b-ejDn_JfpWW
178