YchKhan committed on
Commit
b5d29e3
1 Parent(s): ecf9456

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +6 -5
split_files_to_excel.py CHANGED
@@ -471,19 +471,20 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
471
  docs = []
472
  for i, filename in enumerate(input_folder):
473
  path = filename#os.path.join(input_folder, filename)
474
- print(f"Treating file {i}/{len(input_folder)}")
475
  # Select the appropriate document loader
476
  chunks=[]
477
  if path.endswith(".pdf"):
478
  try:
479
  print("Treatment of pdf file", path)
480
  raw_chunks = split_pdf(path, input_folder)
481
- for raw_chunk in raw_chunks:
482
- print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
483
- raw_chunk.metadata["Base Folder"] = base_folders[i]
484
  sb_chunks = group_chunks_by_section(raw_chunks)
485
  if nb_pages > 0:
486
  for sb_chunk in sb_chunks:
 
487
  if int(sb_chunk.metadata["page_number"])<nb_pages:
488
  chunks.append(sb_chunk)
489
  else:
@@ -602,7 +603,7 @@ def split_in_df(files, nb_pages):
602
  else:
603
  processed_files.append(file_path)
604
  base_folders.append("")
605
- print(f"BASE FOLDERS LIST : {base_folders}")
606
  print("Finished processing zip files\nSplitting files into chunks...")
607
  documents = split_doc_in_chunks(processed_files, base_folders, nb_pages)
608
  re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
 
471
  docs = []
472
  for i, filename in enumerate(input_folder):
473
  path = filename#os.path.join(input_folder, filename)
474
+ print(f"Treating file {i+1}/{len(input_folder)}")
475
  # Select the appropriate document loader
476
  chunks=[]
477
  if path.endswith(".pdf"):
478
  try:
479
  print("Treatment of pdf file", path)
480
  raw_chunks = split_pdf(path, input_folder)
481
+ for j, raw_chunk in enumerate(raw_chunks):
482
+ print(f"BASE zzzzz LIST : {base_folders} = i = {j}")
483
+ raw_chunk.metadata["Base Folder"] = base_folders[j]
484
  sb_chunks = group_chunks_by_section(raw_chunks)
485
  if nb_pages > 0:
486
  for sb_chunk in sb_chunks:
487
+ print(f"CHUNK PAGENUM = {sb_chunk.metadata['page_number']}")
488
  if int(sb_chunk.metadata["page_number"])<nb_pages:
489
  chunks.append(sb_chunk)
490
  else:
 
603
  else:
604
  processed_files.append(file_path)
605
  base_folders.append("")
606
+ print(f"BASE FOLDERS LIST : {base_folders}, FILES LIST : {processed_files}")
607
  print("Finished processing zip files\nSplitting files into chunks...")
608
  documents = split_doc_in_chunks(processed_files, base_folders, nb_pages)
609
  re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)