Spaces:

dteam
/

chatgpt-dteam

Running

AllenYkl committed on Mar 31, 2023

Commit

686f883

•

1 Parent(s): a429dbf

Update bin_public/app/llama_func.py

Files changed (1) hide show

bin_public/app/llama_func.py CHANGED Viewed

@@ -15,34 +15,38 @@ from bin_public.utils.utils import *
 def get_documents(file_src):
     documents = []
-    index_name = ""
     logging.debug("Loading documents...")
     logging.debug(f"file_src: {file_src}")
     for file in file_src:
-        logging.debug(f"file: {file.name}")
-        index_name += file.name
         if os.path.splitext(file.name)[1] == ".pdf":
             logging.debug("Loading PDF...")
-            CJKPDFReader = download_loader("CJKPDFReader")
-            loader = CJKPDFReader()
-            documents += loader.load_data(file=file.name)
         elif os.path.splitext(file.name)[1] == ".docx":
             logging.debug("Loading DOCX...")
             DocxReader = download_loader("DocxReader")
             loader = DocxReader()
-            documents += loader.load_data(file=file.name)
         elif os.path.splitext(file.name)[1] == ".epub":
             logging.debug("Loading EPUB...")
             EpubReader = download_loader("EpubReader")
             loader = EpubReader()
-            documents += loader.load_data(file=file.name)
         else:
             logging.debug("Loading text file...")
             with open(file.name, "r", encoding="utf-8") as f:
-                text = add_space(f.read())
-                documents += [Document(text)]
-    index_name = sha1sum(index_name)
-    return documents, index_name
 def construct_index(

 def get_documents(file_src):
     documents = []
     logging.debug("Loading documents...")
     logging.debug(f"file_src: {file_src}")
     for file in file_src:
+        logging.info(f"loading file: {file.name}")
         if os.path.splitext(file.name)[1] == ".pdf":
             logging.debug("Loading PDF...")
+            pdftext = ""
+            with open(file.name, 'rb') as pdfFileObj:
+                pdfReader = PyPDF2.PdfReader(pdfFileObj)
+                for page in tqdm(pdfReader.pages):
+                    pdftext += page.extract_text()
+            text_raw = pdftext
         elif os.path.splitext(file.name)[1] == ".docx":
             logging.debug("Loading DOCX...")
             DocxReader = download_loader("DocxReader")
             loader = DocxReader()
+            text_raw = loader.load_data(file=file.name)[0].text
         elif os.path.splitext(file.name)[1] == ".epub":
             logging.debug("Loading EPUB...")
             EpubReader = download_loader("EpubReader")
             loader = EpubReader()
+            text_raw = loader.load_data(file=file.name)[0].text
         else:
             logging.debug("Loading text file...")
             with open(file.name, "r", encoding="utf-8") as f:
+                text_raw = f.read()
+        text = add_space(text_raw)
+        # text = block_split(text)
+        # documents += text
+        documents += [Document(text)]
+    logging.debug("Documents loaded.")
+    return documents
 def construct_index(