Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask, render_template, request, redirect, send_file
|
2 |
+
from langchain.llms import HuggingFaceHub
|
3 |
+
from langchain.vectorstores import Chroma
|
4 |
+
from langchain.chains import RetrievalQA
|
5 |
+
import os
|
6 |
+
import sys
|
7 |
+
from langchain.embeddings import HuggingFaceBgeEmbeddings
|
8 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
9 |
+
from langchain.document_loaders import TextLoader
|
10 |
+
from pypdf import PdfReader
|
11 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
12 |
+
from langchain.schema.document import Document
|
13 |
+
import json
|
14 |
+
import re
|
15 |
+
import random
|
16 |
+
import spacy
|
17 |
+
|
18 |
+
# Flask application object; the route decorators further down attach to it.
app = Flask(__name__)

# Deployment switch: True keeps all working files under /data/, False keeps
# them under the current working directory.
isServer = True

# Root folder for uploads, extracted text, redacted text and the vector store.
baseFilePath = "/data/" if isServer else "./"

# JSON file that stores the original-value -> replacement mapping.
jsonPath = baseFilePath + "keyvalues/redacted.json"

# Token read from the ACCESS_TOKEN environment variable (None when unset).
access_token = os.environ.get("ACCESS_TOKEN")
|
36 |
+
|
37 |
+
# Surnames that generateName() draws from when it builds a replacement name.
lastnames = ["Smith", "Johnson", "Williams", "Jones", "Brown", "Davis", "Miller", "Wilson", "Moore", "Taylor", "Anderson", "Thomas", "Jackson", "White", "Harris", "Martin", "Thompson", "Garcia", "Martinez", "Robinson", "Clark", "Rodriguez", "Lewis", "Lee", "Walker", "Hall", "Allen", "Young", "Hernandez", "King", "Wright", "Lopez", "Hill", "Scott", "Green", "Adams", "Baker", "Gonzalez", "Nelson", "Carter", "Mitchell", "Perez", "Roberts", "Turner", "Phillips", "Campbell", "Parker", "Evans", "Edwards", "Collins", "Stewart", "Sanchez", "Morris", "Rogers", "Reed", "Cook", "Morgan", "Bell", "Murphy", "Bailey", "Rivera", "Cooper", "Richardson", "Cox", "Howard", "Ward", "Torres", "Peterson", "Gray", "Ramirez", "James", "Watson", "Brooks", "Kelly", "Sanders", "Price", "Bennett", "Wood", "Barnes", "Ross", "Henderson", "Coleman", "Jenkins", "Perry", "Powell", "Long", "Patterson", "Hughes", "Flores", "Washington", "Butler", "Simmons", "Foster", "Gonzales", "Bryant", "Alexander", "Russell", "Griffin", "Diaz", "Hayes"]
|
38 |
+
|
39 |
+
def generateName():
    """Return a random replacement name of the form "First Last".

    The first name is picked from the module-level ``names`` list and
    title-cased; the surname is picked from ``lastnames`` and used as is.
    """
    first = names[random.randrange(len(names))]
    last = lastnames[random.randrange(len(lastnames))]
    return f"{first.title()} {last}"
|
41 |
+
|
42 |
+
def valueInJSON(value, key):
    """Look up the stored replacement for *value* in category *key*.

    Reads the module-level ``data`` mapping.

    Args:
        value: the original text, e.g. a detected name.
        key: the category inside ``data``, e.g. "names".

    Returns:
        The stored replacement string, or "" when the category or the value
        is unknown or the stored replacement is empty.
    """
    # Callers compare the result against "", so an empty stored value must
    # come back as "" rather than falling through and returning None.
    return data.get(key, {}).get(value) or ""
|
48 |
+
|
49 |
+
# Create the working directories first: the mapping file below lives in
# keyvalues/, so that directory has to exist before the file is opened.
for _folder in ("documents/", "text/", "redacted/", "chroma_db/", "keyvalues/"):
    os.makedirs(baseFilePath + _folder, exist_ok=True)

# Seed an empty mapping file when none exists yet.
if not os.path.exists(jsonPath):
    with open(jsonPath, 'w') as file:
        json.dump({"names": {}, "addresses": {}, "companyNames": {}, "phoneNumbers": {}, "emails": {}}, file, indent=2)

# In-memory copy of the mapping; written back to jsonPath after uploads.
with open(jsonPath, 'r') as file:
    data = json.load(file)

# First names, lower-cased; used to check detected names and to build
# replacement names.
with open('names.txt', 'r') as file:
    names = [x.lower() for x in file.read().splitlines()]
|
69 |
+
|
70 |
+
def redactDocument(filepath):
    """Replace detected person names in a text file with fake names.

    Each line is passed through the module-level ``NER`` callable (assigned
    in chat()). An entity is treated as a name to redact when it is labelled
    PERSON, has at least two whitespace-separated tokens, and its first token
    (lower-cased) is in ``names``. The replacement is reused from ``data``
    when one is stored, otherwise a new one is generated and recorded in
    ``data['names']``.

    Args:
        filepath: path of the text file to redact.

    Returns:
        The redacted text, which is also written to
        ``redacted/<name>.txt`` under ``baseFilePath``.
    """
    filename = filepath.split("/")[-1].split(".")[0]
    # Context manager so the input handle is closed once the lines are read.
    with open(filepath, "r") as source:
        lines = source.readlines()

    text = ""
    for line in lines:
        text += line
        for word in NER(line).ents:
            # split() without arguments never yields empty tokens; an empty
            # token passed to str.replace would insert the replacement
            # between every character of the text.
            parts = word.text.split()
            if word.label_ != "PERSON" or len(parts) < 2 or parts[0].lower() not in names:
                continue
            fakeName = valueInJSON(word.text, "names")
            if not fakeName:
                fakeName = generateName()
                data['names'][word.text] = fakeName
            # Replacing the full name also covers possessive forms such as
            # "Name's", so no separate replacement is needed for those.
            text = text.replace(word.text, fakeName)
            # Bare mentions of the surname: use the last token on both sides
            # so a middle name or initial is not mistaken for the surname.
            text = text.replace(parts[-1], fakeName.split()[-1])

    txtFile = baseFilePath + "redacted/" + filename + ".txt"
    with open(txtFile, "w+") as f:
        f.write(text)
    return text
|
109 |
+
|
110 |
+
# True until question() has built the retrieval chain for the first time.
isFirst = True

# Conversation log of (prompt, answer) pairs, seeded with one empty pair.
history = [("", "")]

# Embedding model handed to the Chroma store.
if isServer:
    embeddings = HuggingFaceEmbeddings()
else:
    model = "BAAI/bge-base-en-v1.5"
    encode_kwargs = {"normalize_embeddings": True}
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model,
        encode_kwargs=encode_kwargs,
        model_kwargs={"device": "cpu"},
    )
|
126 |
+
|
127 |
+
def hideOutput():
    """Point sys.stdout and sys.stderr at os.devnull (one new handle each)."""
    for stream_name in ("stdout", "stderr"):
        setattr(sys, stream_name, open(os.devnull, 'w'))
|
130 |
+
|
131 |
+
def showOutput():
    """Restore sys.stdout and sys.stderr to the interpreter's original streams."""
    sys.stdout, sys.stderr = sys.__stdout__, sys.__stderr__
|
134 |
+
|
135 |
+
def prepareOnlineLLM():
    """Open the Chroma store and build the module-level ``qa`` retrieval chain.

    The language model is the hosted Mixtral instruct model reached through
    HuggingFaceHub; ``access_token`` is passed explicitly only when
    ``isServer`` is False.
    """
    global qa
    store = Chroma(persist_directory=baseFilePath + "chroma_db", embedding_function=embeddings)
    retriever = store.as_retriever()

    hub_options = {
        "repo_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "model_kwargs": {"temperature": 0.1, "max_new_tokens": 750},
    }
    if not isServer:
        hub_options["huggingfacehub_api_token"] = access_token
    llm = HuggingFaceHub(**hub_options)

    print(retriever)
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )
|
146 |
+
|
147 |
+
def question(history, text):
    """Answer a user prompt through the retrieval chain, masking stored values.

    Original values recorded in the mapping file are swapped for their
    replacements before the prompt is sent, and swapped back in the answer.

    Args:
        history: list that receives the (masked prompt, raw answer) pair.
        text: the user's prompt.

    Returns:
        The answer text with replacements mapped back to the originals.
    """
    global isFirst
    # The retrieval chain is built once, on the first question.
    if isFirst:
        prepareOnlineLLM()
        isFirst = False

    with open(jsonPath, 'r') as mappingFile:
        jsonValues = json.load(mappingFile)

    # Mask every stored original, in its exact and its all-lowercase spelling.
    for mapping in jsonValues.values():
        for original, replacement in mapping.items():
            for needle in (original, original.lower()):
                if needle in text:
                    text = text.replace(needle, replacement)

    query = "You are a helpful assistant. Generate responses exclusively from the information contained in the documents. In the event that a user inquiry seeks information not explicitly stated in the documents, refrain from providing an answer. Exercise precision by relying solely on the information explicitly presented in the documents; avoid making inferences, assumptions, or speculations beyond what is explicitly mentioned. User Prompt: " + text
    result = qa({"query": query})
    answer = result['result']
    history.append((text, answer))
    print(answer)

    # Map replacements in the answer back to the original values.
    for mapping in jsonValues.values():
        for original, replacement in mapping.items():
            answer = answer.replace(replacement, original)

    return answer
|
176 |
+
|
177 |
+
def extractText(file):
    """Convert a PDF to text, redact it, and persist the replacement mapping.

    The extracted text (one newline appended per page) is written to
    ``text/<name>.txt`` under ``baseFilePath``, then passed to
    redactDocument(); afterwards ``data`` is printed and saved to ``jsonPath``.

    Args:
        file: path of the PDF to read.
    """
    reader = PdfReader(file)
    stem = os.path.splitext(os.path.basename(file))[0]
    text = "".join(page.extract_text() + "\n" for page in reader.pages)

    txtFile = baseFilePath + "text/" + stem + ".txt"
    with open(txtFile, "w+") as out:
        out.write(text)

    redactDocument(txtFile)
    print(data)
    with open(jsonPath, 'w') as mappingFile:
        json.dump(data, mappingFile, indent=2)
|
192 |
+
|
193 |
+
def newFile(files, filepaths):
    """Redact each uploaded document and add it to the Chroma store.

    Args:
        files: uploaded file objects, parallel to ``filepaths``. The content
            of .txt uploads is read from these objects.
        filepaths: destination paths under documents/; the extension after
            the last "." selects PDF or plain-text handling.

    Returns:
        None when every file was processed, or the string
        "Error: File type not supported" as soon as an unsupported extension
        is met (files after it are not processed).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=0, separators=[" ", ",", "\n"]
    )
    for file, filepath in zip(files, filepaths):
        displayName = filepath.split("/")[-1]
        extension = filepath.split(".")[-1]
        filename = displayName.split(".")[0]
        print("Processing: " + displayName)

        if extension == "pdf":
            #EXTRACTING TEXT AND PROCESSING PDF
            extractText(filepath)
        elif extension == "txt":
            # Read the content from the upload itself. Opening the document
            # path with "w+" and reading it back truncates the file first and
            # always yields an empty text.
            uploaded = file.read()
            if isinstance(uploaded, bytes):
                uploaded = uploaded.decode("utf-8", errors="replace")
            documentPath = baseFilePath + "documents/" + filename + ".txt"
            saveFile = baseFilePath + "text/" + filename + ".txt"
            for target in (documentPath, saveFile):
                with open(target, "w") as f:
                    f.write(uploaded)

            redactDocument(saveFile)
            with open(jsonPath, 'w') as mappingFile:
                json.dump(data, mappingFile, indent=2)
        else:
            return "Error: File type not supported"

        redactedFile = baseFilePath + "redacted/" + filename + ".txt"
        with open(redactedFile, 'r') as f:
            fileText = f.read()

        #STORES TO CHROMA DB
        # Index with the module-level `embeddings`, the same object the
        # retriever is built with, so stored and query vectors come from one
        # model.
        docs = [Document(page_content=x) for x in text_splitter.split_text(fileText)]
        Chroma.from_documents(docs, embeddings, persist_directory=baseFilePath + "chroma_db")
        print("Done processing: " + displayName)
|
228 |
+
|
229 |
+
@app.route('/', methods=['GET', 'POST'])
def chat():
    """Serve the chat page (GET) or ingest uploaded documents (POST).

    Returns:
        GET: the rendered chat.html with the list of stored documents.
        POST: "Success", or the error string from newFile() with status 400.
    """
    global NER
    documents_directory = baseFilePath + "documents/"

    if request.method == 'POST':
        #HANDLES FILE UPLOADS
        # Load the spaCy model once and reuse it for later uploads.
        if globals().get("NER") is None:
            NER = spacy.load("en_core_web_lg")

        os.makedirs(documents_directory, exist_ok=True)
        accepted = []
        filepaths = []
        for file in request.files.getlist('pdf-files[]'):
            # The filename is supplied by the client: keep only its last path
            # component so it cannot point outside the documents directory.
            safeName = os.path.basename((file.filename or "").replace("\\", "/"))
            if not safeName:
                continue  # form field submitted without a file
            filepath = os.path.join(documents_directory, safeName)
            if filepath.split(".")[-1] == "pdf":
                with open(filepath, 'wb') as f:
                    f.write(file.read())
            elif filepath.split(".")[-1] == "txt":
                # .txt uploads are not written here; the upload object itself
                # is handed to newFile() below.
                print("txt")
            accepted.append(file)
            filepaths.append(filepath)

        # newFile() returns an error string for unsupported file types;
        # report it instead of claiming success.
        error = newFile(accepted, filepaths)
        if error:
            return error, 400
        return "Success"

    #MAIN PAGE LOAD
    documents = os.listdir(documents_directory)
    return render_template('chat.html', history=[("", "")], documents=documents)
|
260 |
+
|
261 |
+
@app.route('/chat', methods=['GET'])
def askQuestion():
    """Answer the prompt passed in the ``message`` query parameter.

    Returns:
        The answer text from question(), or an error message with status 400
        when the parameter is absent.
    """
    #PROCESSING USER QUESTIONS
    text = request.args.get('message')
    # Without this guard a missing parameter reaches question() as None and
    # fails there with a TypeError.
    if text is None:
        return "Error: missing 'message' parameter", 400
    return question(history, text)
|
267 |
+
|
268 |
+
@app.route('/document', methods=['GET'])
def document():
    """Send back the stored document named by the ``name`` query parameter.

    Returns:
        The file from ``baseFilePath + "documents/"``, an error with status
        400 when ``name`` is absent or empty, or an error with status 404
        when no such file exists.
    """
    #RETURNS DOCUMENTS
    name = request.args.get('name')
    if not name:
        return "Error: missing 'name' parameter", 400
    # `name` comes from the client: keep only the last path component so
    # "../" sequences or absolute paths cannot escape the documents folder.
    safeName = os.path.basename(name.replace("\\", "/"))
    # Uploads are stored under baseFilePath; an absolute path keeps
    # send_file from resolving it against a different base directory.
    path = os.path.abspath(os.path.join(baseFilePath, "documents", safeName))
    if not os.path.isfile(path):
        return "Error: document not found", 404
    return send_file(path)
|
274 |
+
|
275 |
+
@app.route('/clear', methods=['GET', 'POST'])
def clear():
    """Delete all stored documents, the vector store and the saved mapping.

    Returns:
        A redirect to the main page.
    """
    #CLEARS ALL FILES
    import shutil  # function-local so the file's import block stays untouched

    # Remove entries through the filesystem API rather than a shell command:
    # stored names originate from uploads and must never be parsed by a shell.
    for folder in ("documents/", "text/", "redacted/"):
        directory = baseFilePath + folder
        for entry in os.listdir(directory):
            target = os.path.join(directory, entry)
            try:
                if os.path.isdir(target) and not os.path.islink(target):
                    shutil.rmtree(target)
                else:
                    os.remove(target)
            except OSError as error:
                # Best effort, like `rm -rf`: report and carry on.
                print("Could not remove " + target + ": " + str(error))

    shutil.rmtree(baseFilePath + "chroma_db/", ignore_errors=True)

    emptyMapping = {"names": {}, "addresses": {}, "companyNames": {}, "phoneNumbers": {}, "emails": {}}
    with open(jsonPath, 'w') as file:
        json.dump(emptyMapping, file, indent=2)
    # Reset the in-memory mapping too; otherwise the old entries are written
    # back to the file by the next upload.
    data.clear()
    data.update(emptyMapping)
    return redirect('/')
|
295 |
+
|
296 |
+
if __name__ == '__main__':
    # Debug mode enables the interactive debugger, which lets a client run
    # arbitrary code; switch it on only when not running as the server.
    app.run(debug=not isServer)
|