YchKhan committed on
Commit
5fb1f69
1 Parent(s): 9767141

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +2 -2
split_files_to_excel.py CHANGED
@@ -181,7 +181,7 @@ def create_documents(source, snippets, font_sizes):
181
  ## Group Chunks docx or pdf
182
 
183
  # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
184
- def group_chunks_by_section(chunks, min_chunk_size=512):
185
  filtered_chunks = [chunk for chunk in chunks if chunk.metadata['category'] != 'PageBreak']# Add more filters if needed
186
  #print(f"filtered = {len(filtered_chunks)} - before = {len(chunks)}")
187
  new_chunks = []
@@ -580,7 +580,7 @@ def split_in_df(files):
580
  if file_path.endswith('.zip'):
581
  extracted_files = extract_zip(file_path)
582
  processed_files.extend(extracted_files)
583
- base_folders.append(os.path.splitext(os.path.basename(file_path))[0])
584
  else:
585
  processed_files.append(file_path)
586
  base_folders.append("")
 
181
  ## Group Chunks docx or pdf
182
 
183
  # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
184
+ def group_chunks_by_section(chunks, min_chunk_size=64):
185
  filtered_chunks = [chunk for chunk in chunks if chunk.metadata['category'] != 'PageBreak']# Add more filters if needed
186
  #print(f"filtered = {len(filtered_chunks)} - before = {len(chunks)}")
187
  new_chunks = []
 
580
  if file_path.endswith('.zip'):
581
  extracted_files = extract_zip(file_path)
582
  processed_files.extend(extracted_files)
583
+ base_folders.extend([os.path.splitext(os.path.basename(file_path))[0] * len(extracted_files)])
584
  else:
585
  processed_files.append(file_path)
586
  base_folders.append("")