Standard_Intelligence_Dev

Sleeping

App Files Files Community

YchKhan committed on Jun 21, 2024

Commit

62ab562

verified ·

1 Parent(s): c128c5d

Update split_files_to_excel.py

Browse files

Files changed (1) hide show

split_files_to_excel.py +43 -4

split_files_to_excel.py CHANGED Viewed

@@ -243,7 +243,7 @@ def group_chunks_by_section(chunks, min_chunk_size=64):
 # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
 ## Split documents by font
-def split_pdf(file_path, folder):
     loader = PDFMinerPDFasHTMLLoader(file_path)
     data = loader.load()[0]   # entire pdf is loaded as a single Document
@@ -259,7 +259,7 @@ def split_pdf(file_path, folder):
     return chunks
 # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
-def split_docx(file_path, folder):
     chunks_elms = partition_docx(filename=file_path)
     chunks = []
     file_categories = file_path.split("/")
@@ -282,6 +282,36 @@ def split_docx(file_path, folder):
         chunks.append(chunk)
     return chunks
 # Load the index of documents (if it has already been built)
 def rebuild_index(input_folder, output_folder):
@@ -477,7 +507,7 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
         if path.endswith(".pdf"):
             # try:
             print("Treatment of pdf file", path)
-            raw_chunks = split_pdf(path, input_folder)
             for raw_chunk in raw_chunks:
                 print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
                 raw_chunk.metadata["Base Folder"] = base_folders[i]
@@ -499,7 +529,7 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
         elif path.endswith(".docx"):
             try:
                 print ("Treatment of docx file", path)
-                raw_chunks = split_docx(path, input_folder)
                 for raw_chunk in raw_chunks:
                     raw_chunk.metadata["Base Folder"] = base_folders[i]
                 #print(f"RAW :\n***\n{raw_chunks}")
@@ -534,6 +564,15 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
             #The file type is not supported (e.g. .xlsx)
             except Exception as e:
                 print(f"An error occurred: {e}")
         try:
             if len(chunks)>0:
                 docs += chunks

 # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
 ## Split documents by font
+def split_pdf(file_path):
     loader = PDFMinerPDFasHTMLLoader(file_path)
     data = loader.load()[0]   # entire pdf is loaded as a single Document
     return chunks
 # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
+def split_docx(file_path):
     chunks_elms = partition_docx(filename=file_path)
     chunks = []
     file_categories = file_path.split("/")
         chunks.append(chunk)
     return chunks
+def split_txt(file_path, chunk_size=700):
+    """Split a plain-text file into Documents of at most ``chunk_size`` words.
+
+    The file is read whole and split on whitespace; the words are regrouped
+    into consecutive chunks and re-joined with single spaces, so the original
+    line breaks and spacing are not preserved.
+
+    Args:
+        file_path: Path of the text file to read.
+        chunk_size: Maximum number of words per chunk.
+
+    Returns:
+        A list of Document objects, one per chunk, in file order. An empty or
+        whitespace-only file yields an empty list. ``page_number`` in each
+        Document's metadata is the zero-based chunk index.
+    """
+    # Decode explicitly as UTF-8 so the result does not depend on the host
+    # locale, and release the file handle before doing any chunking work.
+    with open(file_path, 'r', encoding='utf-8') as file:
+        words = file.read().split()
+
+    file_basename = os.path.basename(file_path)
+    file_directory = os.path.dirname(file_path)
+    # splitext yields "" for a path without an extension, whereas splitting
+    # the path on "." would return the whole path (or a piece of a dotted
+    # directory name) as the file type.
+    filetype = os.path.splitext(file_path)[1].lstrip(".")
+
+    documents = []
+    for i, start in enumerate(range(0, len(words), chunk_size)):
+        tcontent = ' '.join(words[start:start + chunk_size])
+        # A fresh dict per chunk: the metadata is attached to the Document
+        # and must not be shared between chunks.
+        metadata = {
+            'source': file_path,
+            "filename": file_basename,
+            'file_directory': file_directory,
+            "file_category": "",
+            "file_sub-cat": "",
+            "file_sub2-cat": "",
+            "category": "",
+            "filetype": filetype,
+            "page_number": i
+        }
+        # NOTE(review): both arguments are passed positionally; confirm the
+        # Document class in use accepts metadata as its second positional
+        # parameter (some Document classes only take it as a keyword).
+        documents.append(Document(tcontent, metadata))
+    return documents
 # Load the index of documents (if it has already been built)
 def rebuild_index(input_folder, output_folder):
         if path.endswith(".pdf"):
             # try:
             print("Treatment of pdf file", path)
+            raw_chunks = split_pdf(path)
             for raw_chunk in raw_chunks:
                 print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
                 raw_chunk.metadata["Base Folder"] = base_folders[i]
         elif path.endswith(".docx"):
             try:
                 print ("Treatment of docx file", path)
+                raw_chunks = split_docx(path)
                 for raw_chunk in raw_chunks:
                     raw_chunk.metadata["Base Folder"] = base_folders[i]
                 #print(f"RAW :\n***\n{raw_chunks}")
             #The file type is not supported (e.g. .xlsx)
             except Exception as e:
                 print(f"An error occurred: {e}")
+        elif path.endswith(".txt"):
+            try:
+                print ("Treatment of txt file", path)
+                chunks = split_txt(path)
+                for chunk in chunks:
+                    chunk.metadata["Base Folder"] = base_folders[i]
+                print(f"Document splitted in {len(chunks)} chunks")
+            except Exception as e:
+                print("Error while splitting the docx file: ", e)
         try:
             if len(chunks)>0:
                 docs += chunks