YchKhan committed on
Commit
62ab562
1 Parent(s): c128c5d

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +43 -4
split_files_to_excel.py CHANGED
@@ -243,7 +243,7 @@ def group_chunks_by_section(chunks, min_chunk_size=64):
243
  # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
244
  ## Split documents by font
245
 
246
- def split_pdf(file_path, folder):
247
  loader = PDFMinerPDFasHTMLLoader(file_path)
248
 
249
  data = loader.load()[0] # entire pdf is loaded as a single Document
@@ -259,7 +259,7 @@ def split_pdf(file_path, folder):
259
  return chunks
260
 
261
  # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
262
- def split_docx(file_path, folder):
263
  chunks_elms = partition_docx(filename=file_path)
264
  chunks = []
265
  file_categories = file_path.split("/")
@@ -282,6 +282,36 @@ def split_docx(file_path, folder):
282
  chunks.append(chunk)
283
  return chunks
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  # Load the index of documents (if it has already been built)
286
 
287
  def rebuild_index(input_folder, output_folder):
@@ -477,7 +507,7 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
477
  if path.endswith(".pdf"):
478
  # try:
479
  print("Treatment of pdf file", path)
480
- raw_chunks = split_pdf(path, input_folder)
481
  for raw_chunk in raw_chunks:
482
  print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
483
  raw_chunk.metadata["Base Folder"] = base_folders[i]
@@ -499,7 +529,7 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
499
  elif path.endswith(".docx"):
500
  try:
501
  print ("Treatment of docx file", path)
502
- raw_chunks = split_docx(path, input_folder)
503
  for raw_chunk in raw_chunks:
504
  raw_chunk.metadata["Base Folder"] = base_folders[i]
505
  #print(f"RAW :\n***\n{raw_chunks}")
@@ -534,6 +564,15 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
534
  #The file type is not supported (e.g. .xlsx)
535
  except Exception as e:
536
  print(f"An error occurred: {e}")
 
 
 
 
 
 
 
 
 
537
  try:
538
  if len(chunks)>0:
539
  docs += chunks
 
243
  # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
244
  ## Split documents by font
245
 
246
+ def split_pdf(file_path):
247
  loader = PDFMinerPDFasHTMLLoader(file_path)
248
 
249
  data = loader.load()[0] # entire pdf is loaded as a single Document
 
259
  return chunks
260
 
261
  # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
262
+ def split_docx(file_path):
263
  chunks_elms = partition_docx(filename=file_path)
264
  chunks = []
265
  file_categories = file_path.split("/")
 
282
  chunks.append(chunk)
283
  return chunks
284
 
285
+
286
def split_txt(file_path, chunk_size=700, encoding="utf-8"):
    """Split a plain-text file into fixed-size word chunks wrapped as Documents.

    The file is read in full, tokenised on whitespace, and cut into
    consecutive groups of at most ``chunk_size`` words. Each group is joined
    back with single spaces, so the original line breaks and spacing are not
    preserved in the chunk text.

    Args:
        file_path: Path of the text file to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        encoding: Text encoding used to decode the file. Passed explicitly so
            the result does not depend on the platform's locale default.

    Returns:
        A list of ``Document`` objects, one per chunk, in file order. A file
        that is empty or contains only whitespace yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    # Fail early with a clear message: a zero step makes range() raise an
    # unrelated-looking error, and a negative step silently yields no chunks.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    with open(file_path, 'r', encoding=encoding) as file:
        words = file.read().split()

    file_basename = os.path.basename(file_path)
    # Take the extension from the file name only, so a dot in a parent
    # directory name cannot leak into "filetype" for extension-less files.
    _, dot, extension = file_basename.rpartition(".")

    # Fields shared by every chunk of this file; the category fields are left
    # empty, only "page_number" differs from one chunk to the next.
    base_metadata = {
        'source': file_path,
        "filename": file_basename,
        'file_directory': os.path.dirname(file_path),
        "file_category": "",
        "file_sub-cat": "",
        "file_sub2-cat": "",
        "category": "",
        "filetype": extension if dot else "",
    }

    documents = []
    for index, start in enumerate(range(0, len(words), chunk_size)):
        tcontent = ' '.join(words[start:start + chunk_size])
        # "page_number" is the 0-based chunk index, not a real page number.
        metadata = {**base_metadata, "page_number": index}
        # NOTE(review): Document is defined/imported outside this block. If it
        # is a pydantic-based class (e.g. LangChain's), it may only accept
        # keyword arguments (page_content=..., metadata=...) -- confirm.
        documents.append(Document(tcontent, metadata))

    return documents
314
+
315
  # Load the index of documents (if it has already been built)
316
 
317
  def rebuild_index(input_folder, output_folder):
 
507
  if path.endswith(".pdf"):
508
  # try:
509
  print("Treatment of pdf file", path)
510
+ raw_chunks = split_pdf(path)
511
  for raw_chunk in raw_chunks:
512
  print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
513
  raw_chunk.metadata["Base Folder"] = base_folders[i]
 
529
  elif path.endswith(".docx"):
530
  try:
531
  print ("Treatment of docx file", path)
532
+ raw_chunks = split_docx(path)
533
  for raw_chunk in raw_chunks:
534
  raw_chunk.metadata["Base Folder"] = base_folders[i]
535
  #print(f"RAW :\n***\n{raw_chunks}")
 
564
  #The file type is not supported (e.g. .xlsx)
565
  except Exception as e:
566
  print(f"An error occurred: {e}")
567
+ elif path.endswith(".txt"):
568
+ try:
569
+ print ("Treatment of txt file", path)
570
+ chunks = split_txt(path)
571
+ for chunk in chunks:
572
+ chunk.metadata["Base Folder"] = base_folders[i]
573
+ print(f"Document splitted in {len(chunks)} chunks")
574
+ except Exception as e:
575
+ print("Error while splitting the docx file: ", e)
576
  try:
577
  if len(chunks)>0:
578
  docs += chunks