YchKhan committed on
Commit
e1c1593
1 Parent(s): aea8774

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +20 -20
split_files_to_excel.py CHANGED
@@ -475,27 +475,27 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
475
  # Select the appropriate document loader
476
  chunks=[]
477
  if path.endswith(".pdf"):
478
- try:
479
- print("Treatment of pdf file", path)
480
- raw_chunks = split_pdf(path, input_folder)
481
- for j, raw_chunk in enumerate(raw_chunks):
482
- print(f"BASE zzzzz LIST : {base_folders} = i = {j}")
483
- raw_chunk.metadata["Base Folder"] = base_folders[j]
484
- sb_chunks = group_chunks_by_section(raw_chunks)
485
- if nb_pages > 0:
486
- for sb_chunk in sb_chunks:
487
- print(f"CHUNK PAGENUM = {sb_chunk.metadata['page_number']}")
488
- if int(sb_chunk.metadata["page_number"])<nb_pages:
489
- chunks.append(sb_chunk)
490
- else:
491
- break
492
  else:
493
- chunks = sb_chunks
494
- print(f"Document splitted in {len(chunks)} chunks")
495
- # for chunk in chunks:
496
- # print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
497
- except Exception as e:
498
- print("Error while splitting the pdf file: ", e)
 
 
499
  elif path.endswith(".docx"):
500
  try:
501
  print ("Treatment of docx file", path)
 
475
  # Select the appropriate document loader
476
  chunks=[]
477
  if path.endswith(".pdf"):
478
+ # try:
479
+ print("Treatment of pdf file", path)
480
+ raw_chunks = split_pdf(path, input_folder)
481
+ for j, raw_chunk in enumerate(raw_chunks):
482
+ print(f"BASE zzzzz LIST : {base_folders} = i = {j}")
483
+ raw_chunk.metadata["Base Folder"] = base_folders[j]
484
+ sb_chunks = group_chunks_by_section(raw_chunks)
485
+ if nb_pages > 0:
486
+ for sb_chunk in sb_chunks:
487
+ print(f"CHUNK PAGENUM = {sb_chunk.metadata['page_number']}")
488
+ if int(sb_chunk.metadata["page_number"])<nb_pages:
489
+ chunks.append(sb_chunk)
 
 
490
  else:
491
+ break
492
+ else:
493
+ chunks = sb_chunks
494
+ print(f"Document splitted in {len(chunks)} chunks")
495
+ # for chunk in chunks:
496
+ # print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
497
+ # except Exception as e:
498
+ # print("Error while splitting the pdf file: ", e)
499
  elif path.endswith(".docx"):
500
  try:
501
  print ("Treatment of docx file", path)