LiamDowd committed on
Commit
bf1193e
1 Parent(s): 0e44cf4

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +37 -56
main.py CHANGED
@@ -7,9 +7,10 @@ import sys
7
  from langchain.embeddings import HuggingFaceBgeEmbeddings
8
  from langchain.embeddings import HuggingFaceEmbeddings
9
  from langchain.document_loaders import TextLoader
 
10
  from pypdf import PdfReader
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
- from langchain.schema.document import Document
13
  import json
14
  import re
15
  import random
@@ -17,11 +18,8 @@ import spacy
17
 
18
  app = Flask(__name__)
19
 
20
- #global redact
21
- #redact = False
22
-
23
  global isServer
24
- isServer = True
25
 
26
  global baseFilePath
27
  global jsonPath
@@ -115,13 +113,14 @@ global embeddings
115
  if isServer:
116
  embeddings = HuggingFaceEmbeddings()
117
  else:
118
- model = "BAAI/bge-base-en-v1.5"
119
- encode_kwargs = {
120
- "normalize_embeddings": True
121
- }
122
- embeddings = HuggingFaceBgeEmbeddings(
123
- model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
124
- )
 
125
 
126
  def hideOutput():
127
  sys.stdout = open(os.devnull, 'w')
@@ -131,23 +130,22 @@ def showOutput():
131
  sys.stdout = sys.__stdout__
132
  sys.stderr = sys.__stderr__
133
 
134
- def prepareOnlineLLM():
135
  #PREPARES CHROMA DB AND ACCESSES THE MIXTRAL LLM
136
  db = Chroma(persist_directory=baseFilePath + "chroma_db", embedding_function=embeddings)
137
- retriever = db.as_retriever()
138
  if isServer:
139
- llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature": 0.1, "max_new_tokens": 750})
140
  else:
141
- llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature": 0.1, "max_new_tokens": 750},huggingfacehub_api_token=access_token)
142
- print(retriever)
143
  global qa
144
  qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
145
 
146
  def question(history, text):
147
- global isFirst
148
- if isFirst:
149
- prepareOnlineLLM()
150
- isFirst = False
151
 
152
  with open(jsonPath, 'r') as file:
153
  jsonValues = json.load(file)
@@ -181,9 +179,14 @@ def extractText(file):
181
  for page in reader.pages:
182
  text += page.extract_text() + "\n"
183
  txtFile = baseFilePath + "text/" + filename + ".txt"
 
 
184
  with open(txtFile, "w+") as f:
185
  #f.write(re.sub(r'\s+', ' ', text))
 
186
  f.write(text)
 
 
187
  redactDocument(txtFile)
188
  print(data)
189
  with open(jsonPath, 'w') as file:
@@ -193,40 +196,23 @@ def newFile(files, filepaths):
193
  count = 0
194
  for file in files:
195
  print("Processing: " + filepaths[count].split("/")[-1])
196
- if filepaths[count].split(".")[-1] == "pdf":
197
- #EXTRACTING TEXT AND PROCESSING PDF
198
- extractText(filepaths[count])
199
- elif filepaths[count].split(".")[-1] == "txt":
200
- #CREATING .TXT FILE BY SAVING THE UPLOADED FILE
201
- filename = filepaths[count].split("/")[-1].split(".")[0]
202
- documentPath = baseFilePath + "documents/" + filename + ".txt"
203
- with open(documentPath, "w+") as f:
204
- textToCopy = "\n".join(f.readlines())
205
- saveFile = baseFilePath + "text/" + filename + ".txt"
206
- with open(saveFile, "w+") as f:
207
- f.write(textToCopy)
208
-
209
- redactDocument(saveFile)
210
- with open(jsonPath, 'w') as file:
211
- json.dump(data, file, indent=2)
212
- else:
213
- return "Error: File type not supported"
214
  redactedFile = filepaths[count].split("/")[-1].split(".")[0]
 
 
215
  redactedFile = baseFilePath + "redacted/" + redactedFile + ".txt"
216
- with open(redactedFile, 'r') as f:
217
- fileText = f.read()
 
218
  text_splitter = RecursiveCharacterTextSplitter(
219
- chunk_size=1000, chunk_overlap=0, separators=[" ", ",", "\n"]
220
  )
221
- texts = text_splitter.split_text(fileText)
222
- doc = Document(page_content=texts, metadata={"source": "local"})
223
- embeddings = HuggingFaceEmbeddings()
224
- #STORES TO CHROMA DB
225
- #docs = [Document(page_content=x) for x in text_splitter.split_text(fileText)]
226
- #print(docs)
227
  print(texts)
228
  chromaDirectory = baseFilePath + "chroma_db"
229
- db = Chroma.from_documents(texts, embeddings, persist_directory=chromaDirectory)
230
  print("Done processing: " + filepaths[count].split("/")[-1])
231
  count = count + 1
232
 
@@ -246,13 +232,8 @@ def chat():
246
  count = 0
247
  for file in files:
248
  filepath = os.path.join(documents_directory, filenames[count])
249
- #make it work for pdf and txt files
250
- if filepath.split(".")[-1] == "pdf":
251
- with open(filepath, 'wb') as f:
252
- f.write(file.read())
253
- elif filepath.split(".")[-1] == "txt":
254
- #CREATING .TXT FILE BY SAVING THE UPLOADED FILE
255
- print("txt")
256
  filepaths.append(filepath)
257
  count = count + 1
258
  newFile(files, filepaths)
 
7
  from langchain.embeddings import HuggingFaceBgeEmbeddings
8
  from langchain.embeddings import HuggingFaceEmbeddings
9
  from langchain.document_loaders import TextLoader
10
+ from langchain.document_loaders import OnlinePDFLoader
11
  from pypdf import PdfReader
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
13
+ from langchain.text_splitter import CharacterTextSplitter
14
  import json
15
  import re
16
  import random
 
18
 
19
  app = Flask(__name__)
20
 
 
 
 
21
  global isServer
22
+ isServer = False
23
 
24
  global baseFilePath
25
  global jsonPath
 
113
  if isServer:
114
  embeddings = HuggingFaceEmbeddings()
115
  else:
116
+ embeddings = HuggingFaceEmbeddings()
117
+ #model = "BAAI/bge-base-en-v1.5"
118
+ #encode_kwargs = {
119
+ # "normalize_embeddings": True
120
+ #}
121
+ #embeddings = HuggingFaceBgeEmbeddings(
122
+ # model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
123
+ #)
124
 
125
  def hideOutput():
126
  sys.stdout = open(os.devnull, 'w')
 
130
  sys.stdout = sys.__stdout__
131
  sys.stderr = sys.__stderr__
132
 
133
+ def prepareLLM():
134
  #PREPARES CHROMA DB AND ACCESSES THE MIXTRAL LLM
135
  db = Chroma(persist_directory=baseFilePath + "chroma_db", embedding_function=embeddings)
136
+ retriever = db.as_retriever(search_kwargs={'k': 1})
137
  if isServer:
138
+ llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature": 0.1, "max_new_tokens": 700})
139
  else:
140
+ llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature": 0.1, "max_new_tokens": 700},huggingfacehub_api_token=access_token)
 
141
  global qa
142
  qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
143
 
144
  def question(history, text):
145
+ #global isFirst
146
+ #if isFirst:
147
+ prepareLLM()
148
+ # isFirst = False
149
 
150
  with open(jsonPath, 'r') as file:
151
  jsonValues = json.load(file)
 
179
  for page in reader.pages:
180
  text += page.extract_text() + "\n"
181
  txtFile = baseFilePath + "text/" + filename + ".txt"
182
+ #with open(txtFile, "w+") as f:
183
+ #make utf 8
184
  with open(txtFile, "w+") as f:
185
  #f.write(re.sub(r'\s+', ' ', text))
186
+ #write text file in utf-8 format
187
  f.write(text)
188
+
189
+ #f.write(text)
190
  redactDocument(txtFile)
191
  print(data)
192
  with open(jsonPath, 'w') as file:
 
196
  count = 0
197
  for file in files:
198
  print("Processing: " + filepaths[count].split("/")[-1])
199
+ #EXTRACTING TEXT AND PROCESSING PDF
200
+ extractText(filepaths[count])
201
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  redactedFile = filepaths[count].split("/")[-1].split(".")[0]
203
+
204
+ #redactedFile = baseFilePath + "redacted/" + redactedFile + ".txt"
205
  redactedFile = baseFilePath + "redacted/" + redactedFile + ".txt"
206
+
207
+ loader = TextLoader(redactedFile, encoding='UTF-8')
208
+ documents = loader.load()
209
  text_splitter = RecursiveCharacterTextSplitter(
210
+ chunk_size=300, chunk_overlap=0, separators=[" ", ",", "\n"]
211
  )
212
+ texts = text_splitter.split_documents(documents)
 
 
 
 
 
213
  print(texts)
214
  chromaDirectory = baseFilePath + "chroma_db"
215
+ Chroma.from_documents(texts, embeddings, persist_directory=chromaDirectory)
216
  print("Done processing: " + filepaths[count].split("/")[-1])
217
  count = count + 1
218
 
 
232
  count = 0
233
  for file in files:
234
  filepath = os.path.join(documents_directory, filenames[count])
235
+ with open(filepath, 'wb') as f:
236
+ f.write(file.read())
 
 
 
 
 
237
  filepaths.append(filepath)
238
  count = count + 1
239
  newFile(files, filepaths)