YchKhan committed on
Commit
d425ddf
1 Parent(s): 5df3669

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +18 -1
split_files_to_excel.py CHANGED
@@ -477,8 +477,25 @@ def build_index(docs, index, output_folder):
477
  output_folder.upload_file(f, os.path.join(temp_dir, f))
478
 
479
 
 
 
 
 
 
 
 
 
480
  def split_in_df(files):
481
- documents = split_doc_in_chunks(files)
 
 
 
 
 
 
 
 
 
482
  df = pd.DataFrame()
483
  for document in documents:
484
  filename = document.metadata['filename']
 
477
  output_folder.upload_file(f, os.path.join(temp_dir, f))
478
 
479
 
480
def extract_zip(zip_path):
    """Extract a zip archive into the current working directory.

    Every member of the archive is extracted (directory entries included,
    so empty folders are still created on disk), but only regular-file
    members are reported back: a directory entry is not a document and
    must not be handed on to file-processing code.

    Args:
        zip_path: Path to the ``.zip`` archive to unpack.

    Returns:
        A list with the archive member name of each extracted file, in
        archive order. The names are the paths stored in the archive, which
        are relative to the current working directory after extraction.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
        FileNotFoundError: If ``zip_path`` does not exist.
    """
    extracted_files = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            # Pass the ZipInfo itself so the exact member is extracted even
            # when the archive holds duplicate names.
            zip_ref.extract(file_info)
            # Directory entries (names ending in '/') are created on disk
            # above but are not files, so they are left out of the result.
            if not file_info.is_dir():
                extracted_files.append(file_info.filename)
    return extracted_files
487
+
488
  def split_in_df(files):
489
+ print("Processing zip files...")
490
+ for file_path in files:
491
+ if file_path.endswith('.zip'):
492
+ extracted_files = extract_zip(file_path)
493
+ processed_files.extend(extracted_files)
494
+ else:
495
+ processed_files.append(file_path)
496
+ print("Finished processing zip files\Splitting files into chunks...")
497
+ documents = split_doc_in_chunks(processed_files)
498
+ print("Finished splitting")
499
  df = pd.DataFrame()
500
  for document in documents:
501
  filename = document.metadata['filename']