LiamDowd committed on
Commit
bf1193e
1 Parent(s): 0e44cf4

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +37 -56
main.py CHANGED
@@ -7,9 +7,10 @@ import sys
7
  from langchain.embeddings import HuggingFaceBgeEmbeddings
8
  from langchain.embeddings import HuggingFaceEmbeddings
9
  from langchain.document_loaders import TextLoader
 
10
  from pypdf import PdfReader
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
- from langchain.schema.document import Document
13
  import json
14
  import re
15
  import random
@@ -17,11 +18,8 @@ import spacy
17
 
18
  app = Flask(__name__)
19
 
20
- #global redact
21
- #redact = False
22
-
23
  global isServer
24
- isServer = True
25
 
26
  global baseFilePath
27
  global jsonPath
@@ -115,13 +113,14 @@ global embeddings
115
  if isServer:
116
  embeddings = HuggingFaceEmbeddings()
117
  else:
118
- model = "BAAI/bge-base-en-v1.5"
119
- encode_kwargs = {
120
- "normalize_embeddings": True
121
- }
122
- embeddings = HuggingFaceBgeEmbeddings(
123
- model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
124
- )
 
125
 
126
  def hideOutput():
127
  sys.stdout = open(os.devnull, 'w')
@@ -131,23 +130,22 @@ def showOutput():
131
  sys.stdout = sys.__stdout__
132
  sys.stderr = sys.__stderr__
133
 
134
- def prepareOnlineLLM():
135
  #PREPARES CHROMA DB AND ACCESSES THE MIXTRAL LLM
136
  db = Chroma(persist_directory=baseFilePath + "chroma_db", embedding_function=embeddings)
137
- retriever = db.as_retriever()
138
  if isServer:
139
- llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature": 0.1, "max_new_tokens": 750})
140
  else:
141
- llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature": 0.1, "max_new_tokens": 750},huggingfacehub_api_token=access_token)
142
- print(retriever)
143
  global qa
144
  qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
145
 
146
  def question(history, text):
147
- global isFirst
148
- if isFirst:
149
- prepareOnlineLLM()
150
- isFirst = False
151
 
152
  with open(jsonPath, 'r') as file:
153
  jsonValues = json.load(file)
@@ -181,9 +179,14 @@ def extractText(file):
181
  for page in reader.pages:
182
  text += page.extract_text() + "\n"
183
  txtFile = baseFilePath + "text/" + filename + ".txt"
 
 
184
  with open(txtFile, "w+") as f:
185
  #f.write(re.sub(r'\s+', ' ', text))
 
186
  f.write(text)
 
 
187
  redactDocument(txtFile)
188
  print(data)
189
  with open(jsonPath, 'w') as file:
@@ -193,40 +196,23 @@ def newFile(files, filepaths):
193
  count = 0
194
  for file in files:
195
  print("Processing: " + filepaths[count].split("/")[-1])
196
- if filepaths[count].split(".")[-1] == "pdf":
197
- #EXTRACTING TEXT AND PROCESSING PDF
198
- extractText(filepaths[count])
199
- elif filepaths[count].split(".")[-1] == "txt":
200
- #CREATING .TXT FILE BY SAVING THE UPLOADED FILE
201
- filename = filepaths[count].split("/")[-1].split(".")[0]
202
- documentPath = baseFilePath + "documents/" + filename + ".txt"
203
- with open(documentPath, "w+") as f:
204
- textToCopy = "\n".join(f.readlines())
205
- saveFile = baseFilePath + "text/" + filename + ".txt"
206
- with open(saveFile, "w+") as f:
207
- f.write(textToCopy)
208
-
209
- redactDocument(saveFile)
210
- with open(jsonPath, 'w') as file:
211
- json.dump(data, file, indent=2)
212
- else:
213
- return "Error: File type not supported"
214
  redactedFile = filepaths[count].split("/")[-1].split(".")[0]
 
 
215
  redactedFile = baseFilePath + "redacted/" + redactedFile + ".txt"
216
- with open(redactedFile, 'r') as f:
217
- fileText = f.read()
 
218
  text_splitter = RecursiveCharacterTextSplitter(
219
- chunk_size=1000, chunk_overlap=0, separators=[" ", ",", "\n"]
220
  )
221
- texts = text_splitter.split_text(fileText)
222
- doc = Document(page_content=texts, metadata={"source": "local"})
223
- embeddings = HuggingFaceEmbeddings()
224
- #STORES TO CHROMA DB
225
- #docs = [Document(page_content=x) for x in text_splitter.split_text(fileText)]
226
- #print(docs)
227
  print(texts)
228
  chromaDirectory = baseFilePath + "chroma_db"
229
- db = Chroma.from_documents(texts, embeddings, persist_directory=chromaDirectory)
230
  print("Done processing: " + filepaths[count].split("/")[-1])
231
  count = count + 1
232
 
@@ -246,13 +232,8 @@ def chat():
246
  count = 0
247
  for file in files:
248
  filepath = os.path.join(documents_directory, filenames[count])
249
- #make it work for pdf and txt files
250
- if filepath.split(".")[-1] == "pdf":
251
- with open(filepath, 'wb') as f:
252
- f.write(file.read())
253
- elif filepath.split(".")[-1] == "txt":
254
- #CREATING .TXT FILE BY SAVING THE UPLOADED FILE
255
- print("txt")
256
  filepaths.append(filepath)
257
  count = count + 1
258
  newFile(files, filepaths)
 
7
  from langchain.embeddings import HuggingFaceBgeEmbeddings
8
  from langchain.embeddings import HuggingFaceEmbeddings
9
  from langchain.document_loaders import TextLoader
10
+ from langchain.document_loaders import OnlinePDFLoader
11
  from pypdf import PdfReader
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
13
+ from langchain.text_splitter import CharacterTextSplitter
14
  import json
15
  import re
16
  import random
 
18
 
19
  app = Flask(__name__)
20
 
 
 
 
21
  global isServer
22
+ isServer = False
23
 
24
  global baseFilePath
25
  global jsonPath
 
113
  if isServer:
114
  embeddings = HuggingFaceEmbeddings()
115
  else:
116
+ embeddings = HuggingFaceEmbeddings()
117
+ #model = "BAAI/bge-base-en-v1.5"
118
+ #encode_kwargs = {
119
+ # "normalize_embeddings": True
120
+ #}
121
+ #embeddings = HuggingFaceBgeEmbeddings(
122
+ # model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
123
+ #)
124
 
125
  def hideOutput():
126
  sys.stdout = open(os.devnull, 'w')
 
130
  sys.stdout = sys.__stdout__
131
  sys.stderr = sys.__stderr__
132
 
133
+ def prepareLLM():
134
  #PREPARES CHROMA DB AND ACCESSES THE MIXTRAL LLM
135
  db = Chroma(persist_directory=baseFilePath + "chroma_db", embedding_function=embeddings)
136
+ retriever = db.as_retriever(search_kwargs={'k': 1})
137
  if isServer:
138
+ llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature": 0.1, "max_new_tokens": 700})
139
  else:
140
+ llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature": 0.1, "max_new_tokens": 700},huggingfacehub_api_token=access_token)
 
141
  global qa
142
  qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
143
 
144
  def question(history, text):
145
+ #global isFirst
146
+ #if isFirst:
147
+ prepareLLM()
148
+ # isFirst = False
149
 
150
  with open(jsonPath, 'r') as file:
151
  jsonValues = json.load(file)
 
179
  for page in reader.pages:
180
  text += page.extract_text() + "\n"
181
  txtFile = baseFilePath + "text/" + filename + ".txt"
182
+ #with open(txtFile, "w+") as f:
183
+ #make utf 8
184
  with open(txtFile, "w+") as f:
185
  #f.write(re.sub(r'\s+', ' ', text))
186
+ #write text file in utf-8 format
187
  f.write(text)
188
+
189
+ #f.write(text)
190
  redactDocument(txtFile)
191
  print(data)
192
  with open(jsonPath, 'w') as file:
 
196
  count = 0
197
  for file in files:
198
  print("Processing: " + filepaths[count].split("/")[-1])
199
+ #EXTRACTING TEXT AND PROCESSING PDF
200
+ extractText(filepaths[count])
201
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  redactedFile = filepaths[count].split("/")[-1].split(".")[0]
203
+
204
+ #redactedFile = baseFilePath + "redacted/" + redactedFile + ".txt"
205
  redactedFile = baseFilePath + "redacted/" + redactedFile + ".txt"
206
+
207
+ loader = TextLoader(redactedFile, encoding='UTF-8')
208
+ documents = loader.load()
209
  text_splitter = RecursiveCharacterTextSplitter(
210
+ chunk_size=300, chunk_overlap=0, separators=[" ", ",", "\n"]
211
  )
212
+ texts = text_splitter.split_documents(documents)
 
 
 
 
 
213
  print(texts)
214
  chromaDirectory = baseFilePath + "chroma_db"
215
+ Chroma.from_documents(texts, embeddings, persist_directory=chromaDirectory)
216
  print("Done processing: " + filepaths[count].split("/")[-1])
217
  count = count + 1
218
 
 
232
  count = 0
233
  for file in files:
234
  filepath = os.path.join(documents_directory, filenames[count])
235
+ with open(filepath, 'wb') as f:
236
+ f.write(file.read())
 
 
 
 
 
237
  filepaths.append(filepath)
238
  count = count + 1
239
  newFile(files, filepaths)