Update main.py
Browse files
main.py
CHANGED
@@ -7,9 +7,10 @@ import sys
|
|
7 |
from langchain.embeddings import HuggingFaceBgeEmbeddings
|
8 |
from langchain.embeddings import HuggingFaceEmbeddings
|
9 |
from langchain.document_loaders import TextLoader
|
|
|
10 |
from pypdf import PdfReader
|
11 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
12 |
-
from langchain.
|
13 |
import json
|
14 |
import re
|
15 |
import random
|
@@ -17,11 +18,8 @@ import spacy
|
|
17 |
|
18 |
app = Flask(__name__)
|
19 |
|
20 |
-
#global redact
|
21 |
-
#redact = False
|
22 |
-
|
23 |
global isServer
|
24 |
-
isServer =
|
25 |
|
26 |
global baseFilePath
|
27 |
global jsonPath
|
@@ -115,13 +113,14 @@ global embeddings
|
|
115 |
if isServer:
|
116 |
embeddings = HuggingFaceEmbeddings()
|
117 |
else:
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
125 |
|
126 |
def hideOutput():
|
127 |
sys.stdout = open(os.devnull, 'w')
|
@@ -131,23 +130,22 @@ def showOutput():
|
|
131 |
sys.stdout = sys.__stdout__
|
132 |
sys.stderr = sys.__stderr__
|
133 |
|
134 |
-
def
|
135 |
#PREPARES CHROMA DB AND ACCESSES THE MIXTRAL LLM
|
136 |
db = Chroma(persist_directory=baseFilePath + "chroma_db", embedding_function=embeddings)
|
137 |
-
retriever = db.as_retriever()
|
138 |
if isServer:
|
139 |
-
llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature": 0.1, "max_new_tokens":
|
140 |
else:
|
141 |
-
llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature": 0.1, "max_new_tokens":
|
142 |
-
print(retriever)
|
143 |
global qa
|
144 |
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
|
145 |
|
146 |
def question(history, text):
|
147 |
-
global isFirst
|
148 |
-
if isFirst:
|
149 |
-
|
150 |
-
|
151 |
|
152 |
with open(jsonPath, 'r') as file:
|
153 |
jsonValues = json.load(file)
|
@@ -181,9 +179,14 @@ def extractText(file):
|
|
181 |
for page in reader.pages:
|
182 |
text += page.extract_text() + "\n"
|
183 |
txtFile = baseFilePath + "text/" + filename + ".txt"
|
|
|
|
|
184 |
with open(txtFile, "w+") as f:
|
185 |
#f.write(re.sub(r'\s+', ' ', text))
|
|
|
186 |
f.write(text)
|
|
|
|
|
187 |
redactDocument(txtFile)
|
188 |
print(data)
|
189 |
with open(jsonPath, 'w') as file:
|
@@ -193,40 +196,23 @@ def newFile(files, filepaths):
|
|
193 |
count = 0
|
194 |
for file in files:
|
195 |
print("Processing: " + filepaths[count].split("/")[-1])
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
elif filepaths[count].split(".")[-1] == "txt":
|
200 |
-
#CREATING .TXT FILE BY SAVING THE UPLOADED FILE
|
201 |
-
filename = filepaths[count].split("/")[-1].split(".")[0]
|
202 |
-
documentPath = baseFilePath + "documents/" + filename + ".txt"
|
203 |
-
with open(documentPath, "w+") as f:
|
204 |
-
textToCopy = "\n".join(f.readlines())
|
205 |
-
saveFile = baseFilePath + "text/" + filename + ".txt"
|
206 |
-
with open(saveFile, "w+") as f:
|
207 |
-
f.write(textToCopy)
|
208 |
-
|
209 |
-
redactDocument(saveFile)
|
210 |
-
with open(jsonPath, 'w') as file:
|
211 |
-
json.dump(data, file, indent=2)
|
212 |
-
else:
|
213 |
-
return "Error: File type not supported"
|
214 |
redactedFile = filepaths[count].split("/")[-1].split(".")[0]
|
|
|
|
|
215 |
redactedFile = baseFilePath + "redacted/" + redactedFile + ".txt"
|
216 |
-
|
217 |
-
|
|
|
218 |
text_splitter = RecursiveCharacterTextSplitter(
|
219 |
-
chunk_size=
|
220 |
)
|
221 |
-
texts = text_splitter.
|
222 |
-
doc = Document(page_content=texts, metadata={"source": "local"})
|
223 |
-
embeddings = HuggingFaceEmbeddings()
|
224 |
-
#STORES TO CHROMA DB
|
225 |
-
#docs = [Document(page_content=x) for x in text_splitter.split_text(fileText)]
|
226 |
-
#print(docs)
|
227 |
print(texts)
|
228 |
chromaDirectory = baseFilePath + "chroma_db"
|
229 |
-
|
230 |
print("Done processing: " + filepaths[count].split("/")[-1])
|
231 |
count = count + 1
|
232 |
|
@@ -246,13 +232,8 @@ def chat():
|
|
246 |
count = 0
|
247 |
for file in files:
|
248 |
filepath = os.path.join(documents_directory, filenames[count])
|
249 |
-
|
250 |
-
|
251 |
-
with open(filepath, 'wb') as f:
|
252 |
-
f.write(file.read())
|
253 |
-
elif filepath.split(".")[-1] == "txt":
|
254 |
-
#CREATING .TXT FILE BY SAVING THE UPLOADED FILE
|
255 |
-
print("txt")
|
256 |
filepaths.append(filepath)
|
257 |
count = count + 1
|
258 |
newFile(files, filepaths)
|
|
|
7 |
from langchain.embeddings import HuggingFaceBgeEmbeddings
|
8 |
from langchain.embeddings import HuggingFaceEmbeddings
|
9 |
from langchain.document_loaders import TextLoader
|
10 |
+
from langchain.document_loaders import OnlinePDFLoader
|
11 |
from pypdf import PdfReader
|
12 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
13 |
+
from langchain.text_splitter import CharacterTextSplitter
|
14 |
import json
|
15 |
import re
|
16 |
import random
|
|
|
18 |
|
19 |
app = Flask(__name__)
|
20 |
|
|
|
|
|
|
|
21 |
global isServer
|
22 |
+
isServer = False
|
23 |
|
24 |
global baseFilePath
|
25 |
global jsonPath
|
|
|
113 |
if isServer:
|
114 |
embeddings = HuggingFaceEmbeddings()
|
115 |
else:
|
116 |
+
embeddings = HuggingFaceEmbeddings()
|
117 |
+
#model = "BAAI/bge-base-en-v1.5"
|
118 |
+
#encode_kwargs = {
|
119 |
+
# "normalize_embeddings": True
|
120 |
+
#}
|
121 |
+
#embeddings = HuggingFaceBgeEmbeddings(
|
122 |
+
# model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
|
123 |
+
#)
|
124 |
|
125 |
def hideOutput():
|
126 |
sys.stdout = open(os.devnull, 'w')
|
|
|
130 |
sys.stdout = sys.__stdout__
|
131 |
sys.stderr = sys.__stderr__
|
132 |
|
133 |
+
def prepareLLM():
|
134 |
#PREPARES CHROMA DB AND ACCESSES THE MIXTRAL LLM
|
135 |
db = Chroma(persist_directory=baseFilePath + "chroma_db", embedding_function=embeddings)
|
136 |
+
retriever = db.as_retriever(search_kwargs={'k': 1})
|
137 |
if isServer:
|
138 |
+
llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature": 0.1, "max_new_tokens": 700})
|
139 |
else:
|
140 |
+
llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature": 0.1, "max_new_tokens": 700},huggingfacehub_api_token=access_token)
|
|
|
141 |
global qa
|
142 |
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
|
143 |
|
144 |
def question(history, text):
|
145 |
+
#global isFirst
|
146 |
+
#if isFirst:
|
147 |
+
prepareLLM()
|
148 |
+
# isFirst = False
|
149 |
|
150 |
with open(jsonPath, 'r') as file:
|
151 |
jsonValues = json.load(file)
|
|
|
179 |
for page in reader.pages:
|
180 |
text += page.extract_text() + "\n"
|
181 |
txtFile = baseFilePath + "text/" + filename + ".txt"
|
182 |
+
#with open(txtFile, "w+") as f:
|
183 |
+
#make utf 8
|
184 |
with open(txtFile, "w+") as f:
|
185 |
#f.write(re.sub(r'\s+', ' ', text))
|
186 |
+
#write text file in utf-8 format
|
187 |
f.write(text)
|
188 |
+
|
189 |
+
#f.write(text)
|
190 |
redactDocument(txtFile)
|
191 |
print(data)
|
192 |
with open(jsonPath, 'w') as file:
|
|
|
196 |
count = 0
|
197 |
for file in files:
|
198 |
print("Processing: " + filepaths[count].split("/")[-1])
|
199 |
+
#EXTRACTING TEXT AND PROCESSING PDF
|
200 |
+
extractText(filepaths[count])
|
201 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
redactedFile = filepaths[count].split("/")[-1].split(".")[0]
|
203 |
+
|
204 |
+
#redactedFile = baseFilePath + "redacted/" + redactedFile + ".txt"
|
205 |
redactedFile = baseFilePath + "redacted/" + redactedFile + ".txt"
|
206 |
+
|
207 |
+
loader = TextLoader(redactedFile, encoding='UTF-8')
|
208 |
+
documents = loader.load()
|
209 |
text_splitter = RecursiveCharacterTextSplitter(
|
210 |
+
chunk_size=300, chunk_overlap=0, separators=[" ", ",", "\n"]
|
211 |
)
|
212 |
+
texts = text_splitter.split_documents(documents)
|
|
|
|
|
|
|
|
|
|
|
213 |
print(texts)
|
214 |
chromaDirectory = baseFilePath + "chroma_db"
|
215 |
+
Chroma.from_documents(texts, embeddings, persist_directory=chromaDirectory)
|
216 |
print("Done processing: " + filepaths[count].split("/")[-1])
|
217 |
count = count + 1
|
218 |
|
|
|
232 |
count = 0
|
233 |
for file in files:
|
234 |
filepath = os.path.join(documents_directory, filenames[count])
|
235 |
+
with open(filepath, 'wb') as f:
|
236 |
+
f.write(file.read())
|
|
|
|
|
|
|
|
|
|
|
237 |
filepaths.append(filepath)
|
238 |
count = count + 1
|
239 |
newFile(files, filepaths)
|