YchKhan committed on
Commit
b012677
1 Parent(s): 1a7b560

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +16 -6
split_files_to_excel.py CHANGED
@@ -493,9 +493,9 @@ def split_doc_in_chunks(input_folder):
493
  return docs
494
 
495
  # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
496
- def resplit_by_end_of_sentence(docs):
497
  print("❌❌\nResplitting docs by end of sentence\n❌❌")
498
- resized_docs = split_chunks_by_tokens_period(docs, max_length=200, overlap=40, min_chunk_size=20)
499
  try:
500
  # add chunk title to all resplitted chunks #todo move this to split_chunks_by_tokens_period(inject_title = True) with a boolean parameter
501
  cur_source = ""
@@ -553,11 +553,12 @@ def split_in_df(files):
553
  processed_files.append(file_path)
554
  print("Finished processing zip files\Splitting files into chunks...")
555
  documents = split_doc_in_chunks(processed_files)
 
556
  print("Finished splitting")
557
  df = pd.DataFrame()
558
- for document in documents:
559
- filename = document.metadata['filename']
560
- content = document.page_content
561
 
562
  # metadata = document.metadata
563
  # metadata_keys = list(metadata.keys())
@@ -836,4 +837,13 @@ def non_intelligent_split(files, chunk_size = 1000):
836
 
837
  df.to_excel("dataframe_keywords.xlsx", index=False)
838
 
839
- return "dataframe_keywords.xlsx"
 
 
 
 
 
 
 
 
 
 
493
  return docs
494
 
495
  # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
496
+ def resplit_by_end_of_sentence(docs, max_len, overlap, min_len):
497
  print("❌❌\nResplitting docs by end of sentence\n❌❌")
498
+ resized_docs = split_chunks_by_tokens_period(docs, max_len, overlap, min_len)
499
  try:
500
  # add chunk title to all resplitted chunks #todo move this to split_chunks_by_tokens_period(inject_title = True) with a boolean parameter
501
  cur_source = ""
 
553
  processed_files.append(file_path)
554
  print("Finished processing zip files\Splitting files into chunks...")
555
  documents = split_doc_in_chunks(processed_files)
556
+ re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
557
  print("Finished splitting")
558
  df = pd.DataFrame()
559
+ for re_doc in re_docs:
560
+ filename = re_doc.metadata['filename']
561
+ content = re_doc.page_content
562
 
563
  # metadata = document.metadata
564
  # metadata_keys = list(metadata.keys())
 
837
 
838
  df.to_excel("dataframe_keywords.xlsx", index=False)
839
 
840
+ return "dataframe_keywords.xlsx"
841
+
842
+
843
def function_split_call(fi_input, dropdown, choice, chunk_size):
    """Run the splitting routine selected by ``choice`` and return its result.

    * ``"Intelligent split"`` calls ``split_in_df(fi_input)``.
    * ``"Non intelligent split"`` calls
      ``non_intelligent_split(fi_input, chunk_size)``.
    * Any other value calls ``split_by_keywords(fi_input, dropdown)``.

    ``dropdown`` is only used by the keyword branch and ``chunk_size`` only by
    the non-intelligent branch.
    """
    if choice == "Intelligent split":
        return split_in_df(fi_input)

    if choice == "Non intelligent split":
        return non_intelligent_split(fi_input, chunk_size)

    # Every remaining choice is handled as a keyword-based split.
    return split_by_keywords(fi_input, dropdown)