Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files - split_files_to_excel.py +2 -2
split_files_to_excel.py
CHANGED
@@ -181,7 +181,7 @@ def create_documents(source, snippets, font_sizes):
|
|
181 |
## Group Chunks docx or pdf
|
182 |
|
183 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
184 |
-
def group_chunks_by_section(chunks, min_chunk_size=
|
185 |
filtered_chunks = [chunk for chunk in chunks if chunk.metadata['category'] != 'PageBreak']# Add more filters if needed
|
186 |
#print(f"filtered = {len(filtered_chunks)} - before = {len(chunks)}")
|
187 |
new_chunks = []
|
@@ -580,7 +580,7 @@ def split_in_df(files):
|
|
580 |
if file_path.endswith('.zip'):
|
581 |
extracted_files = extract_zip(file_path)
|
582 |
processed_files.extend(extracted_files)
|
583 |
-
base_folders.
|
584 |
else:
|
585 |
processed_files.append(file_path)
|
586 |
base_folders.append("")
|
|
|
181 |
## Group Chunks docx or pdf
|
182 |
|
183 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
184 |
+
def group_chunks_by_section(chunks, min_chunk_size=64):
|
185 |
filtered_chunks = [chunk for chunk in chunks if chunk.metadata['category'] != 'PageBreak']# Add more filters if needed
|
186 |
#print(f"filtered = {len(filtered_chunks)} - before = {len(chunks)}")
|
187 |
new_chunks = []
|
|
|
580 |
if file_path.endswith('.zip'):
|
581 |
extracted_files = extract_zip(file_path)
|
582 |
processed_files.extend(extracted_files)
|
583 |
+
base_folders.extend([os.path.splitext(os.path.basename(file_path))[0] * len(extracted_files)])
|
584 |
else:
|
585 |
processed_files.append(file_path)
|
586 |
base_folders.append("")
|