YchKhan committed on
Commit
9ea18b7
1 Parent(s): 410e3c8

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +17 -8
split_files_to_excel.py CHANGED
@@ -455,7 +455,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
455
 
456
  # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
457
 
458
- def split_doc_in_chunks(input_folder):
459
  docs = []
460
  for i, filename in enumerate(input_folder):
461
  path = filename#os.path.join(input_folder, filename)
@@ -465,8 +465,10 @@ def split_doc_in_chunks(input_folder):
465
  if path.endswith(".pdf"):
466
  try:
467
  print("Treatment of pdf file", path)
468
- raw_chuncks = split_pdf(path, input_folder)
469
- chunks = group_chunks_by_section(raw_chuncks)
 
 
470
  print(f"Document splitted in {len(chunks)} chunks")
471
  # for chunk in chunks:
472
  # print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
@@ -475,9 +477,11 @@ def split_doc_in_chunks(input_folder):
475
  elif path.endswith(".docx"):
476
  try:
477
  print ("Treatment of docx file", path)
478
- raw_chuncks = split_docx(path, input_folder)
479
- #print(f"RAW :\n***\n{raw_chuncks}")
480
- chunks = group_chunks_by_section(raw_chuncks)
 
 
481
  print(f"Document splitted in {len(chunks)} chunks")
482
  #if "cards-Jan 2022-SP.docx" in path:
483
  #for chunk in chunks:
@@ -496,6 +500,7 @@ def split_doc_in_chunks(input_folder):
496
  chunk.metadata["filename"] = filename.split("/")[-1]
497
  chunk.metadata["file_directory"] = filename.split("/")[:-1]
498
  chunk.metadata["filetype"] = filename.split(".")[-1]
 
499
  if "page" in chunk.metadata:
500
  counter[chunk.metadata['page']] += 1
501
  for i in range(len(chunks)):
@@ -566,15 +571,18 @@ def extract_zip(zip_path):
566
 
567
  def split_in_df(files):
568
  processed_files = []
 
569
  print("Processing zip files...")
570
  for file_path in files:
571
  if file_path.endswith('.zip'):
572
  extracted_files = extract_zip(file_path)
573
  processed_files.extend(extracted_files)
 
574
  else:
575
  processed_files.append(file_path)
576
- print("Finished processing zip files\Splitting files into chunks...")
577
- documents = split_doc_in_chunks(processed_files)
 
578
  re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
579
  print("Finished splitting")
580
  df = pd.DataFrame()
@@ -590,6 +598,7 @@ def split_in_df(files):
590
 
591
  doc_data["Token_Length"] = re_doc.metadata['token_length']
592
  doc_data["Titles"] = re_doc.metadata['titles'] if 'titles' in re_doc.metadata else ""
 
593
 
594
  # for key, value in zip(metadata_keys, metadata_values):
595
  # doc_data[key] = value
 
455
 
456
  # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
457
 
458
+ def split_doc_in_chunks(input_folder, base_folders):
459
  docs = []
460
  for i, filename in enumerate(input_folder):
461
  path = filename#os.path.join(input_folder, filename)
 
465
  if path.endswith(".pdf"):
466
  try:
467
  print("Treatment of pdf file", path)
468
+ raw_chunks = split_pdf(path, input_folder)
469
+ for raw_chunk in raw_chunks:
470
+ raw_chunk.metadata["Base Folder"] = base_folders[i]
471
+ chunks = group_chunks_by_section(raw_chunks)
472
  print(f"Document splitted in {len(chunks)} chunks")
473
  # for chunk in chunks:
474
  # print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
 
477
  elif path.endswith(".docx"):
478
  try:
479
  print ("Treatment of docx file", path)
480
+ raw_chunks = split_docx(path, input_folder)
481
+ for raw_chunk in raw_chunks:
482
+ raw_chunk.metadata["Base Folder"] = base_folders[i]
483
+ #print(f"RAW :\n***\n{raw_chunks}")
484
+ chunks = group_chunks_by_section(raw_chunks)
485
  print(f"Document splitted in {len(chunks)} chunks")
486
  #if "cards-Jan 2022-SP.docx" in path:
487
  #for chunk in chunks:
 
500
  chunk.metadata["filename"] = filename.split("/")[-1]
501
  chunk.metadata["file_directory"] = filename.split("/")[:-1]
502
  chunk.metadata["filetype"] = filename.split(".")[-1]
503
+ chunk.metadata["Base Folder"] = base_folders[i]
504
  if "page" in chunk.metadata:
505
  counter[chunk.metadata['page']] += 1
506
  for i in range(len(chunks)):
 
571
 
572
  def split_in_df(files):
573
  processed_files = []
574
+ base_folders = []
575
  print("Processing zip files...")
576
  for file_path in files:
577
  if file_path.endswith('.zip'):
578
  extracted_files = extract_zip(file_path)
579
  processed_files.extend(extracted_files)
580
+ base_folders.append(os.path.splitext(os.path.basename(file_path))[0])
581
  else:
582
  processed_files.append(file_path)
583
+ base_folders.append("")
584
+ print("Finished processing zip files\nSplitting files into chunks...")
585
+ documents = split_doc_in_chunks(processed_files, base_folders)
586
  re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
587
  print("Finished splitting")
588
  df = pd.DataFrame()
 
598
 
599
  doc_data["Token_Length"] = re_doc.metadata['token_length']
600
  doc_data["Titles"] = re_doc.metadata['titles'] if 'titles' in re_doc.metadata else ""
601
+ doc_data["Base Folder"] = re_doc.metadata["Base Folder"]
602
 
603
  # for key, value in zip(metadata_keys, metadata_values):
604
  # doc_data[key] = value