Spaces:

LiamDowd
/

Redact

Sleeping

App Files Files Community

Redact / main.py

LiamDowd

Update main.py

fb6d9d1 7 months ago

raw

history blame contribute delete

11.6 kB

	import json
	import os
	import platform
	import random
	import re
	import shutil
	import sys

	import spacy
	from flask import Flask, render_template, request, redirect, send_file
	from langchain.chains import RetrievalQA
	from langchain.document_loaders import OnlinePDFLoader
	from langchain.document_loaders import TextLoader
	from langchain.embeddings import HuggingFaceBgeEmbeddings
	from langchain.embeddings import HuggingFaceEmbeddings
	from langchain.llms import HuggingFaceHub
	from langchain.text_splitter import CharacterTextSplitter
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.vectorstores import Chroma
	from pypdf import PdfReader

	# Flask application object; the route decorators further down register on it.
	app = Flask(__name__)

	# macOS ("Darwin") is treated as the local development machine; every other
	# platform is treated as the hosted server.
	isServer = platform.system() != "Darwin"

	# Root folder for every generated file: "/data/" on the server, the current
	# working directory locally.
	baseFilePath = "/data/" if isServer else "./"
	# JSON file that stores the original-value -> placeholder mapping.
	jsonPath = baseFilePath + "keyvalues/redacted.json"

	# Hugging Face API token taken from the ACCESS_TOKEN environment variable
	# (None when unset). It is only passed to the LLM client off-server.
	access_token = os.environ.get("ACCESS_TOKEN")

	# Surnames that generateName() pairs with a first name to build a placeholder.
	lastnames = [
		"Smith", "Johnson", "Williams", "Jones", "Brown", "Davis", "Miller", "Wilson", "Moore", "Taylor",
		"Anderson", "Thomas", "Jackson", "White", "Harris", "Martin", "Thompson", "Garcia", "Martinez", "Robinson",
		"Clark", "Rodriguez", "Lewis", "Lee", "Walker", "Hall", "Allen", "Young", "Hernandez", "King",
		"Wright", "Lopez", "Hill", "Scott", "Green", "Adams", "Baker", "Gonzalez", "Nelson", "Carter",
		"Mitchell", "Perez", "Roberts", "Turner", "Phillips", "Campbell", "Parker", "Evans", "Edwards", "Collins",
		"Stewart", "Sanchez", "Morris", "Rogers", "Reed", "Cook", "Morgan", "Bell", "Murphy", "Bailey",
		"Rivera", "Cooper", "Richardson", "Cox", "Howard", "Ward", "Torres", "Peterson", "Gray", "Ramirez",
		"James", "Watson", "Brooks", "Kelly", "Sanders", "Price", "Bennett", "Wood", "Barnes", "Ross",
		"Henderson", "Coleman", "Jenkins", "Perry", "Powell", "Long", "Patterson", "Hughes", "Flores", "Washington",
		"Butler", "Simmons", "Foster", "Gonzales", "Bryant", "Alexander", "Russell", "Griffin", "Diaz", "Hayes",
	]

	def generateName():
		"""Return a random placeholder full name.

		One entry is drawn from the module-level ``names`` list and title-cased,
		one entry is drawn from ``lastnames``, and the two are joined with a
		single space.

		Raises:
			IndexError: If either list is empty.
		"""
		return random.choice(names).title() + " " + random.choice(lastnames)

	def valueInJSON(value, key):
		"""Look up the placeholder already stored for ``value``.

		Args:
			value: The original text (for example a real name) to look up.
			key: Category inside the module-level ``data`` mapping, e.g. "names".

		Returns:
			The stored placeholder, or "" when the category is missing, the value
			is missing, or the stored placeholder is empty.
		"""
		# The previous version fell off the end and returned None when the stored
		# placeholder was "", which callers comparing against "" read as a hit.
		return data.get(key, {}).get(value) or ""

	# Make sure every working folder exists below the base path.
	for _folder in ("documents/", "text/", "redacted/", "chroma_db/", "keyvalues/"):
		os.makedirs(baseFilePath + _folder, exist_ok=True)

	# First run: seed the mapping file with one empty dict per category.
	if not os.path.exists(jsonPath):
		with open(jsonPath, 'w+') as file:
			json.dump({"names": {}, "addresses": {}, "companyNames": {}, "phoneNumbers": {}, "emails": {}}, file, indent=2)

	# In-memory copy of the original-value -> placeholder mapping.
	with open(jsonPath, 'r') as file:
		data = json.load(file)

	# First names (lower-cased), one per line of names.txt.
	with open('names.txt', 'r') as file:
		names = [entry.lower() for entry in file.read().splitlines()]

	#with open('addresses.txt', 'r') as file:
	# addresses = file.read().splitlines()

	def redactDocument(filepath):
		"""Replace detected person names in a text file with placeholder names.

		Every line is run through the module-level spaCy pipeline ``NER``. A
		PERSON entity is redacted only when it has at least two whitespace-separated
		parts and its first part, lower-cased, appears in ``names``. The full name
		is swapped for a placeholder (reused from ``data['names']`` when one is
		stored, otherwise generated and recorded there). Stand-alone occurrences
		of the real surname are swapped for the placeholder surname.

		The redacted text is written to ``<baseFilePath>redacted/<stem>.txt``,
		where ``<stem>`` is the input file name up to its first dot.

		Args:
			filepath: Path of the plain-text file to redact.

		Returns:
			The redacted text.
		"""
		# Context manager so the input handle is always closed.
		with open(filepath, "r", encoding="utf-8") as source:
			lines = source.readlines()
		# Stem up to the FIRST dot; newFile() rebuilds the output path the same way.
		filename = filepath.split("/")[-1].split(".")[0]

		firstNames = set(names)  # constant-time membership tests inside the loop

		# Pass 1: collect real name -> placeholder pairs in order of appearance.
		replacements = {}
		for line in lines:
			for entity in NER(line).ents:
				if entity.label_ != "PERSON" or entity.text in replacements:
					continue
				# split() rather than split(' '): doubled spaces inside an entity
				# must not yield an empty part, because replacing "" would insert
				# the placeholder between every character of the document.
				parts = entity.text.split()
				if len(parts) < 2 or parts[0].lower() not in firstNames:
					continue
				fakeName = valueInJSON(entity.text, "names")
				if not fakeName:
					fakeName = generateName()
					data['names'][entity.text] = fakeName
				replacements[entity.text] = fakeName

		# Pass 2: apply the replacements to the whole document, so occurrences on
		# lines where the entity recogniser missed the name are covered too.
		text = "".join(lines)
		# Longest real names first so a name that is a prefix of another
		# (e.g. "John Smith" / "John Smithson") cannot clobber the longer one.
		ordered = sorted(replacements.items(), key=lambda pair: len(pair[0]), reverse=True)
		for realName, fakeName in ordered:
			text = text.replace(realName, fakeName)
		for realName, fakeName in ordered:
			fakeParts = fakeName.split()
			if not fakeParts:
				continue
			# Whole-word match only, so a short surname is never rewritten inside
			# a longer word. The callable replacement keeps the placeholder literal.
			surnamePattern = r"(?<!\w)" + re.escape(realName.split()[-1]) + r"(?!\w)"
			text = re.sub(surnamePattern, lambda _match, surname=fakeParts[-1]: surname, text)

		# TODO: e-mail redaction was sketched here previously but is not implemented.

		txtFile = baseFilePath + "redacted/" + filename + ".txt"
		with open(txtFile, "w", encoding="utf-8") as f:
			f.write(text)
		return text

	# Flag left over from the disabled "prepare the LLM only once" logic in question().
	isFirst = True
	# Module-wide chat log of (prompt, answer) tuples; the /chat route passes it to question().
	history = [("", "")]

	# Embedding model used for both indexing and retrieval. The server and local
	# branches were identical, so one instance is created unconditionally.
	embeddings = HuggingFaceEmbeddings()
	#model = "BAAI/bge-base-en-v1.5"
	#encode_kwargs = {
	# "normalize_embeddings": True
	#}
	#embeddings = HuggingFaceBgeEmbeddings(
	# model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
	#)

	def hideOutput():
		"""Silence the process by pointing sys.stdout and sys.stderr at os.devnull.

		Call showOutput() to restore the interpreter's original streams.
		"""
		# Release handles from an earlier hideOutput() so repeated calls do not
		# leak file descriptors.
		showOutput()
		sys.stdout = open(os.devnull, 'w')
		sys.stderr = open(os.devnull, 'w')

	def showOutput():
		"""Restore sys.stdout and sys.stderr to the interpreter's original streams.

		A devnull handle installed by hideOutput() is closed first. Any other
		replacement stream is left open because this module does not own it.
		"""
		for attr, original in (("stdout", sys.__stdout__), ("stderr", sys.__stderr__)):
			current = getattr(sys, attr)
			if current is not original and getattr(current, "name", None) == os.devnull:
				current.close()
			setattr(sys, attr, original)

	def prepareLLM():
		"""Build the retrieval QA chain and publish it as the module-level ``qa``.

		Opens the Chroma store persisted under ``baseFilePath``, wraps it in a
		retriever configured with ``k=1``, and connects it to the
		Mixtral-8x7B-Instruct model on the Hugging Face Hub. Off-server the token
		held in ``access_token`` is passed explicitly; on the server no token
		argument is given.
		"""
		global qa
		store = Chroma(persist_directory=baseFilePath + "chroma_db", embedding_function=embeddings)
		retriever = store.as_retriever(search_kwargs={'k': 1})

		# Only the local setup hands the token over as an argument.
		hubOptions = {}
		if not isServer:
			hubOptions["huggingfacehub_api_token"] = access_token
		llm = HuggingFaceHub(
			repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
			model_kwargs={"temperature": 0.1, "max_new_tokens": 700},
			**hubOptions,
		)

		qa = RetrievalQA.from_chain_type(
			llm=llm,
			chain_type="stuff",
			retriever=retriever,
			return_source_documents=True,
		)

	def question(history, text):
		"""Answer a user prompt through the retrieval chain without exposing real values.

		Original values recorded in the JSON mapping are swapped for their
		placeholders before the prompt is sent to the chain, and placeholders in
		the answer are swapped back afterwards.

		Args:
			history: List that receives a ``(redacted prompt, raw answer)`` tuple.
			text: The user's prompt. None or "" returns "" without calling the chain.

		Returns:
			The answer with placeholders replaced by the original values.
		"""
		if not text:
			return ""

		# The chain is rebuilt on every call; the run-once guard is disabled.
		prepareLLM()

		with open(jsonPath, 'r') as file:
			jsonValues = json.load(file)

		# Flatten every category into (original, placeholder) pairs.
		pairs = [
			(original, placeholder)
			for mapping in jsonValues.values()
			for original, placeholder in mapping.items()
		]

		#REDACTING SENSITIVE INFO IN REQUEST
		for original, placeholder in pairs:
			if not original:
				continue  # replacing "" would insert the placeholder between all characters
			# Case-insensitive, so any capitalisation of a known value is hidden;
			# the callable keeps the placeholder from being read as a regex template.
			text = re.sub(re.escape(original), lambda _match, fake=placeholder: fake, text, flags=re.IGNORECASE)

		query = "You are a helpful assistant. Generate responses exclusively from the information contained in the documents. In the event that a user inquiry seeks information not explicitly stated in the documents, refrain from providing an answer. Exercise precision by relying solely on the information explicitly presented in the documents; avoid making inferences, assumptions, or speculations beyond what is explicitly mentioned. User Prompt: " + text
		result = qa({"query": query})
		history.append((text, result['result']))
		resultValue = result['result']
		print(resultValue)

		#UNREDACTING THE RESULT
		for original, placeholder in pairs:
			if placeholder:  # an empty placeholder would match between all characters
				resultValue = resultValue.replace(placeholder, original)

		return resultValue

	def extractText(file):
		"""Convert a PDF to text, redact it, and persist the updated mapping.

		The extracted text is written to ``<baseFilePath>text/<stem>.txt``, that
		file is handed to redactDocument(), and the module-level ``data`` mapping
		is then saved to ``jsonPath``.

		Args:
			file: Path of the PDF to process.
		"""
		#TAKING A PDF FILE AND CONVERTING IT TO A .TXT IN THE "TEXT" FOLDER
		reader = PdfReader(file)
		filename = os.path.splitext(os.path.basename(file))[0]
		# One join instead of repeated concatenation; "or ''" tolerates a page for
		# which no text comes back.
		text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

		txtFile = baseFilePath + "text/" + filename + ".txt"
		# Explicit UTF-8 so the output does not depend on the platform default encoding.
		with open(txtFile, "w", encoding="utf-8") as f:
			f.write(text)

		redactDocument(txtFile)
		print(data)
		# Distinct handle name so the ``file`` parameter is not shadowed.
		with open(jsonPath, 'w') as mappingFile:
			json.dump(data, mappingFile, indent=2)

	def newFile(files, filepaths):
		"""Extract, redact, and index each uploaded PDF into the Chroma store.

		Args:
			files: The uploaded file objects; used only to pair with ``filepaths``.
			filepaths: Saved PDF path for each entry of ``files``, in the same order.
		"""
		chromaDirectory = baseFilePath + "chroma_db"
		# The splitter settings never change, so build it once for all files.
		text_splitter = RecursiveCharacterTextSplitter(
			chunk_size=300, chunk_overlap=0, separators=[" ", ",", "\n"]
		)
		for _upload, filepath in zip(files, filepaths):
			pdfName = filepath.split("/")[-1]
			print("Processing: " + pdfName)
			#EXTRACTING TEXT AND PROCESSING PDF
			extractText(filepath)

			# redactDocument() names its output after the part of the file name
			# before the first dot, so the same rule is applied here.
			redactedFile = baseFilePath + "redacted/" + pdfName.split(".")[0] + ".txt"

			documents = TextLoader(redactedFile, encoding='UTF-8').load()
			texts = text_splitter.split_documents(documents)
			Chroma.from_documents(texts, embeddings, persist_directory=chromaDirectory)
			print("Done processing: " + pdfName)

	@app.route('/', methods=['GET', 'POST'])
	def chat():
		"""Serve the chat page (GET) or ingest uploaded PDFs (POST).

		POST: saves every file from the ``pdf-files[]`` form field into the
		documents folder, runs newFile() on the saved files, and returns "Success".
		GET: renders ``chat.html`` with the names of the stored documents.
		"""
		global NER
		documents_directory = baseFilePath + "documents/"

		if request.method == 'POST':
			#HANDLES FILE UPLOADS
			# Load the spaCy model once per process instead of on every upload.
			if globals().get("NER") is None:
				NER = spacy.load("en_core_web_lg")
			os.makedirs(documents_directory, exist_ok=True)

			savedFiles = []
			filepaths = []
			for file in request.files.getlist('pdf-files[]'):
				# The client controls the filename: keep only its last path
				# component so "../" or absolute paths cannot leave the folder.
				safeName = os.path.basename((file.filename or "").replace("\\", "/"))
				if safeName in ("", ".", ".."):
					continue  # empty upload slot or unusable name
				filepath = os.path.join(documents_directory, safeName)
				with open(filepath, 'wb') as f:
					f.write(file.read())
				savedFiles.append(file)
				filepaths.append(filepath)

			# Both lists have the same length, as newFile() expects.
			newFile(savedFiles, filepaths)
			return "Success"

		#MAIN PAGE LOAD
		documents = os.listdir(documents_directory)
		return render_template('chat.html', history=[("", "")], documents=documents)

	@app.route('/chat', methods=['GET'])
	def askQuestion():
		"""Answer the prompt passed in the ``message`` query parameter.

		Returns:
			The answer text, or a 400 response when ``message`` is missing or empty.
		"""
		#PROCESSING USER QUESTIONS
		text = request.args.get('message')
		if not text:
			# question() performs string operations on the prompt, so a missing
			# parameter would otherwise surface as a server error.
			return "Missing 'message' parameter", 400
		return question(history, text)

	@app.route('/document', methods=['GET'])
	def document():
		"""Send back the stored document named by the ``name`` query parameter.

		Returns:
			The file; a 400 response when ``name`` is missing or is not a plain
			file name; a 404 response when no such document exists.
		"""
		#RETURNS DOCUMENTS
		name = request.args.get('name')
		# Accept only a bare file name, so "../" sequences or absolute paths cannot
		# reach files outside the documents folder.
		if not name or name in (".", "..") or name != os.path.basename(name.replace("\\", "/")):
			return "Invalid document name", 400
		# Uploads are written below baseFilePath by chat(), so serve from there.
		# The path is made absolute so it resolves like the open() call that saved it.
		path = os.path.abspath(os.path.join(baseFilePath + "documents/", name))
		if not os.path.isfile(path):
			return "Document not found", 404
		return send_file(path)

	def _removePath(path):
		"""Delete a file or directory tree, reporting failures instead of raising."""
		try:
			if os.path.isdir(path) and not os.path.islink(path):
				shutil.rmtree(path)
			else:
				os.remove(path)
		except FileNotFoundError:
			pass
		except OSError as error:
			print("Could not remove " + path + ": " + str(error))

	@app.route('/clear', methods=['GET', 'POST'])
	def clear():
		"""Delete all stored documents, derived text, the vector store, and the mapping.

		Returns:
			A redirect to the main page.
		"""
		#CLEARS ALL FILES
		# File names originate from uploads, so they are removed through the
		# filesystem API and never interpolated into a shell command.
		for folder in ("documents/", "text/", "redacted/"):
			directory = baseFilePath + folder
			for entry in os.listdir(directory):
				_removePath(os.path.join(directory, entry))

		chroma_directory = baseFilePath + "chroma_db/"
		_removePath(chroma_directory)
		# Put the empty folder back, matching the state created at start-up.
		os.makedirs(chroma_directory, exist_ok=True)

		# Reset the in-memory mapping as well: extractText() writes ``data`` back
		# to disk after the next upload, which would otherwise restore old entries.
		data.clear()
		data.update({"names": {}, "addresses": {}, "companyNames": {}, "phoneNumbers": {}, "emails": {}})
		with open(jsonPath, 'w') as file:
			json.dump(data, file, indent=2)
		return redirect('/')

	if __name__ == '__main__':
		# Debug mode enables the interactive Werkzeug debugger, which allows
		# arbitrary code execution, so it is enabled only off-server.
		app.run(debug=not isServer)