ryanrwatkins committed
Commit 5320c7c · 1 Parent(s): 5cc5022
Update app.py
app.py
CHANGED
@@ -7,6 +7,15 @@ import langchain
 import chromadb
 import glob
 
+import pickle
+
+from PyPDF2 import PdfReader
+from PyPDF2 import PdfWriter
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS
+from langchain.chains.question_answering import load_qa_chain
+from langchain.llms import OpenAI
 
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import Chroma
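Two of these additions duplicate imports app.py already has: OpenAIEmbeddings is imported again two lines below, and load_qa_chain already arrives with the other langchain.chains imports (visible as context in the next hunk). A deduplicated block covering just what the new code path uses might look like this (a sketch; it assumes ChatOpenAI is imported from langchain.chat_models, since the code below already uses that class):

```python
# Sketch: deduplicated imports for the new PDF -> FAISS -> QA code path.
# (OpenAIEmbeddings and load_qa_chain are currently imported twice.)
import glob
import os
import pickle

from PyPDF2 import PdfReader, PdfWriter
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI  # assumed import path
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
```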
@@ -28,6 +37,9 @@ from langchain.chains.question_answering import load_qa_chain
 
 
 
+
+
+
 def get_empty_state():
     return {"total_tokens": 0, "messages": []}
 
@@ -69,22 +81,51 @@ def submit_message(prompt, prompt_template, temperature, max_tokens, context_len
     os.environ["OPENAI_API_KEY"] = os.environ['openai_key']
 
     # load in all the files
-    path = './files'
+    #path = './files'
+    #pdf_files = glob.glob(os.path.join(path, "*.pdf"))
     #pdf_files = glob.glob(os.path.join(path, "*.pdf"))
-    pdf_files = glob.glob(os.path.join(path, "*.pdf"))
 
-    for file in pdf_files:
-        loader = PyPDFLoader(file)
-        pages = loader.load_and_split()
-        text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=0)
-        split_pages = text_splitter.split_documents(pages)
+    #for file in pdf_files:
+    #    loader = PyPDFLoader(file)
+    #    pages = loader.load_and_split()
+    #    text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=0)
+    #    split_pages = text_splitter.split_documents(pages)
 
-    persist_directory = "./embeddings"
-    embeddings = OpenAIEmbeddings()
-    vectordb = Chroma.from_documents(split_pages, embeddings, persist_directory=persist_directory)
-    vectordb.persist()
-
+    #persist_directory = "./embeddings"
+    #embeddings = OpenAIEmbeddings()
+    #vectordb = Chroma.from_documents(split_pages, embeddings, persist_directory=persist_directory)
+    #vectordb.persist()
 
+    path = './files'
+    pdf_files = glob.glob(os.path.join(path, "*.pdf"))
+
+    merger = PdfWriter()
+
+    # add all file in the list to the merger object
+    for pdf in pdf_files:
+        merger.append(pdf)
+    merger.write("merged-pdf.pdf")
+    merger.close()
+
+    reader = PdfReader("merged-pdf.pdf")
+    raw_text = ''
+    for i, page in enumerate(reader.pages):
+        text = page.extract_text()
+        if text:
+            raw_text += text
+    text_splitter = CharacterTextSplitter(
+        separator = "\n",
+        chunk_size = 1000,
+        chunk_overlap = 200,
+        length_function = len,
+    )
+    texts = text_splitter.split_text(raw_text)
+    len(texts)
+    embeddings = OpenAIEmbeddings()
+
+
+    with open("foo.pkl", 'wb') as f:
+        pickle.dump(embeddings, f)
 
 
     history = state['messages']
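Note that the pickle written at the end of this hunk stores the OpenAIEmbeddings wrapper (essentially its configuration), not any computed vectors, so the chunks are still embedded from scratch when FAISS.from_texts runs later in the request. A minimal sketch of an alternative that embeds once and persists the index itself, assuming LangChain's FAISS save_local/load_local helpers and a hypothetical faiss_index directory:

```python
# Sketch: build the FAISS index once and persist it, instead of pickling
# the embeddings wrapper and re-embedding the texts on every request.
# "faiss_index" is a hypothetical directory name.
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

def build_or_load_index(texts, index_dir="faiss_index"):
    embeddings = OpenAIEmbeddings()
    if os.path.isdir(index_dir):
        # Reuse the index saved by an earlier run.
        return FAISS.load_local(index_dir, embeddings)
    docsearch = FAISS.from_texts(texts, embeddings)
    docsearch.save_local(index_dir)
    return docsearch
```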
@@ -111,16 +152,27 @@ def submit_message(prompt, prompt_template, temperature, max_tokens, context_len
     #query = str(system_prompt + history[-context_length*2:] + [prompt_msg])
     #completion = completion.run(query)
     # from https://blog.devgenius.io/chat-with-document-s-using-openai-chatgpt-api-and-text-embedding-6a0ce3dc8bc8
-    completion_chain = load_qa_chain(ChatOpenAI(temperature=temperature, max_tokens=max_tokens, model_name="gpt-3.5-turbo"), chain_type="stuff" )
-    completion = RetrievalQA(combine_documents_chain=completion_chain, retriever=vectordb.as_retriever(), return_source_documents=False)
+    #completion_chain = load_qa_chain(ChatOpenAI(temperature=temperature, max_tokens=max_tokens, model_name="gpt-3.5-turbo"), chain_type="stuff" )
+    #completion = RetrievalQA(combine_documents_chain=completion_chain, retriever=vectordb.as_retriever(), return_source_documents=False)
     #completion = RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=temperature, max_tokens=max_tokens, model_name="gpt-3.5-turbo"), chain_type="stuff", retriever=vectordb.as_retriever(), return_source_documents=True)
-    query = str(system_prompt + history[-context_length*2:] + [prompt_msg])
+    #query = str(system_prompt + history[-context_length*2:] + [prompt_msg])
     #completion = completion({"query": query})
-    completion = completion.run(query)
+    #completion = completion.run(query)
 
     # completion = completion({"question": query, "chat_history": history[-context_length*2:]})
 
+    with open("foo.pkl", 'rb') as f:
+        new_docsearch = pickle.load(f)
+
+    docsearch = FAISS.from_texts(texts, new_docsearch)
+    query = str(system_prompt + history[-context_length*2:] + [prompt_msg])
+    docs = docsearch.similarity_search(query)
+    #print(docs[0].page_content)
 
+    chain = load_qa_chain(ChatOpenAI(temperature=temperature, max_tokens=max_tokens, model_name="gpt-3.5-turbo"), chain_type="stuff")
+    completion = chain.run(input_documents=docs, question=query)
+
+
     # VectorDBQA.from_chain_type(llm=OpenAI(), chain_type="stuff", vectorstore=docsearch, return_source_documents=True)
     # https://colab.research.google.com/drive/1dzdNDZyofRB0f2KIB4gHXmIza7ehMX30?usp=sharing#scrollTo=b-ejDn_JfpWW
 
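Pulled out of submit_message() for readability, the retrieval flow this hunk wires in amounts to the following (a sketch using the same LangChain calls as the diff; ask_documents is a hypothetical helper name):

```python
# Sketch of the QA flow above: retrieve the chunks most similar to the
# query, then "stuff" them into a single prompt for the chat model.
# "ask_documents" is a hypothetical helper name.
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI  # assumed import path

def ask_documents(docsearch, query, temperature=0.7, max_tokens=256):
    docs = docsearch.similarity_search(query)
    chain = load_qa_chain(
        ChatOpenAI(temperature=temperature, max_tokens=max_tokens,
                   model_name="gpt-3.5-turbo"),
        chain_type="stuff",
    )
    return chain.run(input_documents=docs, question=query)
```

In the diff itself, the query passed to both similarity_search and the chain is str() of the concatenated system prompt, recent history, and new message, so the retrieval query and the question are the same string.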