YchKhan committed on
Commit
ae9b962
1 Parent(s): 6c4fd41

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +14 -6
split_files_to_excel.py CHANGED
@@ -68,7 +68,7 @@ text_splitter = CharacterTextSplitter(
68
 
69
  def function_split_call(fi_input, dropdown, choice, chunk_size):
70
  if choice == "Intelligent split":
71
- return split_in_df(fi_input)
72
  elif choice == "Non intelligent split":
73
  return non_intelligent_split(fi_input, chunk_size)
74
  else:
@@ -78,7 +78,7 @@ def change_textbox(dropdown,radio):
78
  if len(dropdown) == 0 :
79
  dropdown = ["introduction", "objective", "summary", "conclusion"]
80
  if radio == "Intelligent split by keywords":
81
- return gr.Dropdown(dropdown, multiselect=True, visible=True, allow_custom_value=True), gr.Number(visible=False)
82
  elif radio == "Non intelligent split":
83
  return gr.Dropdown(dropdown, visible=False),gr.Number(label="Chunk size", value=1000, interactive=True, visible=True)
84
  else:
@@ -464,7 +464,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
464
 
465
  # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
466
 
467
- def split_doc_in_chunks(input_folder, base_folders):
468
  docs = []
469
  for i, filename in enumerate(input_folder):
470
  path = filename#os.path.join(input_folder, filename)
@@ -478,7 +478,15 @@ def split_doc_in_chunks(input_folder, base_folders):
478
  for raw_chunk in raw_chunks:
479
  print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
480
  raw_chunk.metadata["Base Folder"] = base_folders[i]
481
- chunks = group_chunks_by_section(raw_chunks)
 
 
 
 
 
 
 
 
482
  print(f"Document splitted in {len(chunks)} chunks")
483
  # for chunk in chunks:
484
  # print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
@@ -579,7 +587,7 @@ def extract_zip(zip_path):
579
  zip_ref.extract(file_info.filename)
580
  return extracted_files
581
 
582
- def split_in_df(files):
583
  processed_files = []
584
  base_folders = []
585
  print("Processing zip files...")
@@ -593,7 +601,7 @@ def split_in_df(files):
593
  base_folders.append("")
594
  print(f"BASE FOLDERS LIST : {base_folders}")
595
  print("Finished processing zip files\nSplitting files into chunks...")
596
- documents = split_doc_in_chunks(processed_files, base_folders)
597
  re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
598
  print("Finished splitting")
599
  df = pd.DataFrame()
 
68
 
69
  def function_split_call(fi_input, dropdown, choice, chunk_size):
70
  if choice == "Intelligent split":
71
+ return split_in_df(fi_input, nb_pages)
72
  elif choice == "Non intelligent split":
73
  return non_intelligent_split(fi_input, chunk_size)
74
  else:
 
78
  if len(dropdown) == 0 :
79
  dropdown = ["introduction", "objective", "summary", "conclusion"]
80
  if radio == "Intelligent split by keywords":
81
+ return gr.Dropdown(dropdown, multiselect=True, visible=True, allow_custom_value=True), gr.Number(label="First pages to keep (0 for all)", value=2, interactive=True, visible=True)
82
  elif radio == "Non intelligent split":
83
  return gr.Dropdown(dropdown, visible=False),gr.Number(label="Chunk size", value=1000, interactive=True, visible=True)
84
  else:
 
464
 
465
  # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
466
 
467
+ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
468
  docs = []
469
  for i, filename in enumerate(input_folder):
470
  path = filename#os.path.join(input_folder, filename)
 
478
  for raw_chunk in raw_chunks:
479
  print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
480
  raw_chunk.metadata["Base Folder"] = base_folders[i]
481
+ sb_chunks = group_chunks_by_section(raw_chunks)
482
+ if nb_pages > 0:
483
+ for sb_chunk in sb_chunks:
484
+ if int(sb_chunk.metadata["page_number"])<nb_pages:
485
+ chunks.append(sb_chunk)
486
+ else:
487
+ break
488
+ else:
489
+ chunks = sb_chunks
490
  print(f"Document splitted in {len(chunks)} chunks")
491
  # for chunk in chunks:
492
  # print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
 
587
  zip_ref.extract(file_info.filename)
588
  return extracted_files
589
 
590
+ def split_in_df(files, nb_pages):
591
  processed_files = []
592
  base_folders = []
593
  print("Processing zip files...")
 
601
  base_folders.append("")
602
  print(f"BASE FOLDERS LIST : {base_folders}")
603
  print("Finished processing zip files\nSplitting files into chunks...")
604
+ documents = split_doc_in_chunks(processed_files, base_folders, nb_pages)
605
  re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
606
  print("Finished splitting")
607
  df = pd.DataFrame()