LiamDowd committed on
Commit
83851a0
1 Parent(s): 2d28517

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +297 -0
app.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, render_template, request, redirect, send_file
2
+ from langchain.llms import HuggingFaceHub
3
+ from langchain.vectorstores import Chroma
4
+ from langchain.chains import RetrievalQA
5
+ import os
6
+ import sys
7
+ from langchain.embeddings import HuggingFaceBgeEmbeddings
8
+ from langchain.embeddings import HuggingFaceEmbeddings
9
+ from langchain.document_loaders import TextLoader
10
+ from pypdf import PdfReader
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain.schema.document import Document
13
+ import json
14
+ import re
15
+ import random
16
+ import spacy
17
+
18
# Flask application object; the route handlers further down register on it.
app = Flask(__name__)

#global redact
#redact = False

# Deployment switch: True keeps all working files under "/data/",
# False keeps them under the current working directory.
isServer = True

# Root folder for every file the app reads or writes.
baseFilePath = "/data/" if isServer else "./"
# JSON file holding the real-value -> replacement-value maps.
jsonPath = baseFilePath + "keyvalues/redacted.json"

# NOTE(review): the diff view this was recovered from lost its indentation,
# so this assignment may originally have sat inside the non-server branch.
# Reading the variable unconditionally is harmless either way.
access_token = os.environ.get("ACCESS_TOKEN")

# Surname pool used when generating replacement names.
lastnames = [
    "Smith", "Johnson", "Williams", "Jones", "Brown",
    "Davis", "Miller", "Wilson", "Moore", "Taylor",
    "Anderson", "Thomas", "Jackson", "White", "Harris",
    "Martin", "Thompson", "Garcia", "Martinez", "Robinson",
    "Clark", "Rodriguez", "Lewis", "Lee", "Walker",
    "Hall", "Allen", "Young", "Hernandez", "King",
    "Wright", "Lopez", "Hill", "Scott", "Green",
    "Adams", "Baker", "Gonzalez", "Nelson", "Carter",
    "Mitchell", "Perez", "Roberts", "Turner", "Phillips",
    "Campbell", "Parker", "Evans", "Edwards", "Collins",
    "Stewart", "Sanchez", "Morris", "Rogers", "Reed",
    "Cook", "Morgan", "Bell", "Murphy", "Bailey",
    "Rivera", "Cooper", "Richardson", "Cox", "Howard",
    "Ward", "Torres", "Peterson", "Gray", "Ramirez",
    "James", "Watson", "Brooks", "Kelly", "Sanders",
    "Price", "Bennett", "Wood", "Barnes", "Ross",
    "Henderson", "Coleman", "Jenkins", "Perry", "Powell",
    "Long", "Patterson", "Hughes", "Flores", "Washington",
    "Butler", "Simmons", "Foster", "Gonzales", "Bryant",
    "Alexander", "Russell", "Griffin", "Diaz", "Hayes",
]
38
+
39
def generateName():
    """Return a random replacement name of the form "First Last".

    The first name is picked from the module-level ``names`` list and
    title-cased; the surname is picked from ``lastnames`` and used as-is.
    """
    firstIndex = random.randint(0, len(names) - 1)
    firstName = names[firstIndex].title()
    lastIndex = random.randint(0, len(lastnames) - 1)
    return firstName + " " + lastnames[lastIndex]
41
+
42
def valueInJSON(value, key):
    """Look up the stored replacement for ``value`` in section ``key`` of ``data``.

    Args:
        value: The original (real) string, e.g. a person's full name.
        key: The section of ``data`` to search, e.g. "names".

    Returns:
        The stored replacement string, or "" when the section or the value
        is missing, or when the stored replacement is empty.
    """
    try:
        stored = data[key][value]
    except KeyError:
        return ""
    # An empty stored entry used to fall through and return None; callers
    # compare the result against "", so always hand back a string.
    return stored if stored else ""
48
+
49
# Create the working directories first: the JSON store below lives inside
# "keyvalues/", and open(..., 'w') does not create missing parent folders.
for _folder in ("documents/", "text/", "redacted/", "chroma_db/", "keyvalues/"):
    os.makedirs(baseFilePath + _folder, exist_ok=True)

# Seed an empty redaction store the first time the app runs.
if not os.path.exists(jsonPath):
    with open(jsonPath, 'w') as file:
        json.dump({"names": {}, "addresses": {}, "companyNames": {}, "phoneNumbers": {}, "emails": {}}, file, indent=2)

# In-memory copy of the redaction store (real value -> replacement value).
with open(jsonPath, 'r') as file:
    data = json.load(file)

# Known first names, lower-cased, one per line in names.txt.
with open('names.txt', 'r') as file:
    names = [x.lower() for x in file.read().splitlines()]

#with open('addresses.txt', 'r') as file:
#    addresses = file.read().splitlines()
69
+
70
def redactDocument(filepath):
    """Replace personal names in a text file with generated stand-ins.

    Each line is run through the spaCy pipeline held in the global ``NER``
    (loaded by the upload handler). An entity is treated as a name when it
    is labelled PERSON, contains a space, and its lower-cased first word is
    in ``names``. The real -> replacement mapping is kept in
    ``data['names']`` so the same person always gets the same stand-in.
    The result is written to ``<baseFilePath>redacted/<name>.txt``.

    Args:
        filepath: Path of the plain-text file to redact.

    Returns:
        The redacted text.
    """
    filename = filepath.split("/")[-1].split(".")[0]
    # Context manager so the input handle is closed (it used to be leaked).
    with open(filepath, "r") as source:
        lines = source.readlines()

    text = ""
    for line in lines:
        text += line
        #NAMES
        for entity in NER(line).ents:
            realName = entity.text
            if entity.label_ != "PERSON" or " " not in realName:
                continue
            if realName.lower().split(' ')[0] not in names:
                continue

            # Reuse an existing stand-in; treat a missing or empty lookup
            # result as "no mapping yet".
            fakeName = valueInJSON(realName, "names")
            if not fakeName:
                fakeName = generateName()
                data['names'][realName] = fakeName

            # Replacing the full name also covers the possessive forms
            # ("X's", "X'"), so no separate passes are needed for those.
            text = text.replace(realName, fakeName)

            # Also replace the surname on its own. Use the last
            # whitespace-separated token: index 1 is a middle name for
            # three-part names and an empty string after a double space,
            # and replace("", ...) would corrupt the whole text.
            nameParts = realName.split()
            if len(nameParts) > 1:
                text = text.replace(nameParts[-1], fakeName.split(' ')[-1])
        #EMAIL
        #if re.search(r'\S+@\S+', line):
        #    for i in re.findall(r'\S+@\S+', line):
        #        if i in data['emails']:
        #            fakeEmail = data['emails'][i]
        #        else:
        #            emailProviders = ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com", "aol.com", "icloud.com", "protonmail.com"]
        #            fakeEmail = os.urandom(10).hex() + emailProviders[random.randint(0, len(emailProviders)-1)]
        #            data['emails'][i] = fakeEmail
        #        text = text.replace(i, fakeEmail)

    txtFile = baseFilePath + "redacted/" + filename + ".txt"
    with open(txtFile, "w+") as f:
        f.write(text)
    return text
109
+
110
# True until the first question arrives; question() checks it to build the
# QA chain lazily and then flips it to False.
isFirst = True
# Chat history as (prompt, answer) pairs, seeded with one blank pair.
history = [("", "")]

# Embedding model handed to the Chroma store.
if not isServer:
    model = "BAAI/bge-base-en-v1.5"
    encode_kwargs = {"normalize_embeddings": True}
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model,
        encode_kwargs=encode_kwargs,
        model_kwargs={"device": "cpu"},
    )
else:
    embeddings = HuggingFaceEmbeddings()
126
+
127
def hideOutput():
    """Point sys.stdout and sys.stderr at the null device."""
    # stdout first, then stderr; each gets its own freshly opened handle.
    for streamName in ("stdout", "stderr"):
        setattr(sys, streamName, open(os.devnull, 'w'))
130
+
131
def showOutput():
    """Restore sys.stdout and sys.stderr to the interpreter's original streams.

    If the stream being replaced is a null-device handle (as installed by
    hideOutput()), it is closed first so the file descriptor is not leaked.
    """
    originals = (("stdout", sys.__stdout__), ("stderr", sys.__stderr__))
    for streamName, original in originals:
        current = getattr(sys, streamName)
        # Only close handles that point at the null device; any other
        # stream a caller installed is left open.
        if current is not original and getattr(current, "name", None) == os.devnull:
            current.close()
        setattr(sys, streamName, original)
134
+
135
def prepareOnlineLLM():
    """Open the persisted Chroma store and build the global ``qa`` chain.

    The chain is a "stuff" RetrievalQA over the store's retriever, backed by
    the Mixtral-8x7B-Instruct model on the Hugging Face Hub. Outside server
    mode the API token from ``access_token`` is passed explicitly.
    """
    global qa
    store = Chroma(persist_directory=baseFilePath + "chroma_db", embedding_function=embeddings)
    retriever = store.as_retriever()

    llmOptions = {
        "repo_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "model_kwargs": {"temperature": 0.1, "max_new_tokens": 750},
    }
    if not isServer:
        llmOptions["huggingfacehub_api_token"] = access_token
    llm = HuggingFaceHub(**llmOptions)

    print(retriever)
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )
146
+
147
def question(history, text):
    """Answer a user prompt from the indexed documents.

    Real values found in the prompt are swapped for their stored
    replacements before the prompt is sent to the QA chain, and the
    replacements in the answer are swapped back afterwards.

    Args:
        history: List of (prompt, answer) tuples; the redacted prompt and the
            raw (still redacted) answer are appended to it.
        text: The user's prompt.

    Returns:
        The answer with replacement values mapped back to the real ones.
    """
    global isFirst
    if isFirst:
        # Build the QA chain lazily on the first question.
        prepareOnlineLLM()
        isFirst = False

    with open(jsonPath, 'r') as file:
        jsonValues = json.load(file)

    # Flatten every section into (real, replacement) pairs. Empty strings
    # are skipped: replacing "" would insert text between every character.
    pairs = [
        (real, fake)
        for section in jsonValues.values()
        for real, fake in section.items()
        if real and fake
    ]

    #REDACTING SENSITIVE INFO IN REQUEST
    # Match case-insensitively so spellings such as "JOHN SMITH" are caught
    # too, not only the exact and all-lowercase forms. A function is used
    # as the replacement so backslashes in it are never treated as escapes.
    for real, fake in pairs:
        text = re.sub(re.escape(real), lambda _match, fake=fake: fake, text, flags=re.IGNORECASE)

    query = "You are a helpful assistant. Generate responses exclusively from the information contained in the documents. In the event that a user inquiry seeks information not explicitly stated in the documents, refrain from providing an answer. Exercise precision by relying solely on the information explicitly presented in the documents; avoid making inferences, assumptions, or speculations beyond what is explicitly mentioned. User Prompt: " + text
    result = qa({"query": query})
    resultValue = result['result']
    history.append((text, resultValue))
    print(resultValue)

    #UNREDACTING THE RESULT
    for real, fake in pairs:
        resultValue = resultValue.replace(fake, real)

    return resultValue
176
+
177
def extractText(file):
    """Convert a PDF to text, redact it, and persist the redaction store.

    The extracted text (one newline appended per page) is written to
    ``<baseFilePath>text/<name>.txt``, passed to redactDocument(), and the
    updated ``data`` mapping is then saved to ``jsonPath``.

    Args:
        file: Path of the PDF file to process.
    """
    #TAKING A PDF FILE AND CONVERTING IT TO A .TXT IN THE "TEXT" FOLDER
    reader = PdfReader(file)
    filename = os.path.splitext(os.path.basename(file))[0]
    text = "".join(page.extract_text() + "\n" for page in reader.pages)

    txtFile = baseFilePath + "text/" + filename + ".txt"
    with open(txtFile, "w+") as f:
        f.write(text)

    redactDocument(txtFile)
    # The mapping pairs real names with their stand-ins, so it is saved but
    # deliberately not printed to stdout.
    with open(jsonPath, 'w') as store:
        json.dump(data, store, indent=2)
192
+
193
def newFile(files, filepaths):
    """Redact each uploaded document and add it to the Chroma store.

    PDFs go through extractText(); .txt uploads are copied into the "text"
    folder and redacted directly. The redacted text is then split into
    chunks and stored in the persisted Chroma database.

    Args:
        files: Uploaded file objects, parallel to ``filepaths``. Only used
            for .txt uploads, whose content is read from the object.
        filepaths: Path of each upload inside the "documents" folder.

    Returns:
        None when every file was processed, or the string
        "Error: File type not supported" when at least one file was skipped.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=0, separators=[" ", ",", "\n"]
    )
    unsupported = False

    for file, filepath in zip(files, filepaths):
        uploadName = filepath.split("/")[-1]
        extension = filepath.split(".")[-1]
        filename = uploadName.split(".")[0]
        print("Processing: " + uploadName)

        if extension == "pdf":
            #EXTRACTING TEXT AND PROCESSING PDF
            extractText(filepath)
        elif extension == "txt":
            #CREATING .TXT FILE BY SAVING THE UPLOADED FILE
            # Read the upload itself. The previous code opened the target
            # with "w+", which truncates before reading, so the text was
            # always empty.
            raw = file.read() if hasattr(file, "read") else ""
            if isinstance(raw, bytes):
                raw = raw.decode("utf-8", errors="replace")
            if raw:
                with open(filepath, "w") as f:
                    f.write(raw)
                textToCopy = raw
            else:
                # The upload stream may already have been consumed by the
                # caller; fall back to the copy on disk if there is one.
                try:
                    with open(filepath, "r") as f:
                        textToCopy = f.read()
                except FileNotFoundError:
                    textToCopy = ""

            saveFile = baseFilePath + "text/" + filename + ".txt"
            with open(saveFile, "w+") as f:
                f.write(textToCopy)

            redactDocument(saveFile)
            with open(jsonPath, 'w') as store:
                json.dump(data, store, indent=2)
        else:
            # Skip this file but keep processing the rest of the batch.
            unsupported = True
            continue

        redactedFile = baseFilePath + "redacted/" + filename + ".txt"
        with open(redactedFile, 'r') as f:
            fileText = f.read()

        #STORES TO CHROMA DB
        # Use the module-level embeddings so documents are indexed with the
        # same model the retriever queries with, and the model is not
        # re-instantiated for every file.
        docs = [Document(page_content=x) for x in text_splitter.split_text(fileText)]
        Chroma.from_documents(docs, embeddings, persist_directory=baseFilePath + "chroma_db")
        print("Done processing: " + uploadName)

    if unsupported:
        return "Error: File type not supported"
228
+
229
@app.route('/', methods=['GET', 'POST'])
def chat():
    """Serve the chat page (GET) or ingest uploaded documents (POST).

    POST: saves each file from the 'pdf-files[]' form field into the
    "documents" folder, hands the batch to newFile(), and returns "Success".
    GET: renders chat.html with the list of stored documents.
    """
    global NER
    documents_directory = baseFilePath + "documents/"

    if request.method == 'POST':
        #HANDLES FILE UPLOADS
        # Load the spaCy model once per process instead of on every upload.
        if "NER" not in globals():
            NER = spacy.load("en_core_web_lg")

        os.makedirs(documents_directory, exist_ok=True)
        accepted = []
        filepaths = []
        for file in request.files.getlist('pdf-files[]'):
            # The filename is client-controlled: drop any directory parts
            # (either slash style) so it cannot escape the documents folder.
            safeName = os.path.basename((file.filename or "").replace("\\", "/"))
            if not safeName:
                continue  # empty form slot, nothing was uploaded
            filepath = os.path.join(documents_directory, safeName)
            # Persist both supported types; .txt uploads used to be dropped.
            if filepath.split(".")[-1] in ("pdf", "txt"):
                with open(filepath, 'wb') as f:
                    f.write(file.read())
            accepted.append(file)
            filepaths.append(filepath)

        newFile(accepted, filepaths)
        return "Success"

    #MAIN PAGE LOAD
    documents = os.listdir(documents_directory)
    return render_template('chat.html', history=[("", "")], documents=documents)
260
+
261
@app.route('/chat', methods=['GET'])
def askQuestion():
    """Answer the prompt given in the ``message`` query parameter.

    Returns:
        The answer text from question(), or a 400 response when ``message``
        is missing or empty.
    """
    #PROCESSING USER QUESTIONS
    text = request.args.get('message')
    if not text:
        # question() does string operations on the prompt; a missing
        # parameter (None) would otherwise raise a TypeError.
        return "Error: No message provided", 400
    return question(history, text)
267
+
268
@app.route('/document', methods=['GET'])
def document():
    """Send back the stored document named by the ``name`` query parameter.

    Returns:
        The file from ``<baseFilePath>documents/``, or a 404 response when
        the name is missing or no such file exists.
    """
    #RETURNS DOCUMENTS
    # ``name`` is client-controlled: keep only its final path component so
    # absolute paths and "../" sequences cannot reach files elsewhere.
    name = os.path.basename((request.args.get('name') or "").replace("\\", "/"))
    # Resolve against the same folder uploads are written to.
    path = os.path.abspath(os.path.join(baseFilePath, "documents", name))
    if not name or not os.path.isfile(path):
        return "Document not found", 404
    return send_file(path)
274
+
275
@app.route('/clear', methods=['GET', 'POST'])
def clear():
    """Delete all stored documents, the vector store, and the redaction map.

    Empties the "documents", "text", "redacted" and "chroma_db" folders,
    resets the redaction store both in memory and on disk, and redirects to
    the main page.
    """
    #CLEARS ALL FILES
    import shutil  # local import: only this handler needs it

    # Remove through the filesystem API rather than a shell command:
    # uploaded filenames would otherwise be interpreted by the shell.
    for folder in ("documents/", "text/", "redacted/", "chroma_db/"):
        directory = baseFilePath + folder
        shutil.rmtree(directory, ignore_errors=True)
        os.makedirs(directory, exist_ok=True)

    # Reset the in-memory map as well, so old entries are not written back
    # to disk by the next upload.
    data.clear()
    data.update({"names": {}, "addresses": {}, "companyNames": {}, "phoneNumbers": {}, "emails": {}})
    with open(jsonPath, 'w') as file:
        json.dump(data, file, indent=2)
    return redirect('/')
295
+
296
if __name__ == '__main__':
    # Debug mode enables Werkzeug's interactive debugger, which lets anyone
    # who can reach the page run code on the host. Only turn it on for
    # local (non-server) runs.
    app.run(debug=not isServer)