heymenn commited on
Commit
3f5d008
1 Parent(s): 8e35445

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +139 -4
split_files_to_excel.py CHANGED
@@ -25,7 +25,8 @@ from pypdf import PdfReader
25
 
26
  import pandas as pd
27
 
28
-
 
29
 
30
  MODEL = "thenlper/gte-base"
31
  CHUNK_SIZE = 1000
@@ -530,12 +531,42 @@ def split_in_df(files):
530
  # -------------------------------------------------------------------------------- SPLIT FILES BY KEYWORDS
531
 
532
  def split_by_keywords(files, key_words, words_limit=1000):
 
533
  extracted_content = []
534
-
535
  tabLine = []
536
- for file in files:
537
 
538
- if file.endswith('pdf'):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
  file_name = file
540
  file = PdfReader(file)
541
  pdfNumberPages = len(file.pages)
@@ -629,6 +660,9 @@ def split_by_keywords(files, key_words, words_limit=1000):
629
  tabLine.append([file_name, selectedText, key])
630
  print(f"Selected line in keywords is: {line}")
631
 
 
 
 
632
  for r in tabLine:
633
  text_joined = ''.join(r[1])
634
  text_joined = r[2] + " : \n " + text_joined
@@ -654,3 +688,104 @@ def split_by_keywords(files, key_words, words_limit=1000):
654
 
655
  return "dataframe_keywords.xlsx"
656
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  import pandas as pd
27
 
28
+ import requests
29
+ import json
30
 
31
  MODEL = "thenlper/gte-base"
32
  CHUNK_SIZE = 1000
 
531
  # -------------------------------------------------------------------------------- SPLIT FILES BY KEYWORDS
532
 
533
  def split_by_keywords(files, key_words, words_limit=1000):
534
+ processed_files = []
535
  extracted_content = []
 
536
  tabLine = []
 
537
 
538
+ # For each files : stock the PDF, extract the Zips and convert the Doc & Docx to PDF
539
+ try:
540
+ not_duplicate = True
541
+ for f in files:
542
+ for p in processed_files:
543
+ if (f[:f.rfind('.')] == p[:p.rfind('.')]):
544
+ not_duplicate = False
545
+ if not_duplicate:
546
+ if f.endswith('.zip'):
547
+ extracted_files = extract_zip(f)
548
+ print(f"Those are my extracted files{extracted_files}")
549
+
550
+ for doc in extracted_files:
551
+ if doc.endswith('.doc') or doc.endswith('.docx'):
552
+ processed_files.append(transform_to_pdf(doc))
553
+
554
+ if doc.endswith('.pdf'):
555
+ processed_files.append(doc)
556
+
557
+ if f.endswith('.pdf'):
558
+ processed_files.append(f)
559
+
560
+ if f.endswith('.doc') or f.endswith('.docx'):
561
+ processed_files.append(transform_to_pdf(f))
562
+
563
+ except Exception as ex:
564
+ print(f"Error occured while processing files : {ex}")
565
+
566
+ # For each processed files extract content
567
+ for file in processed_files:
568
+
569
+ try:
570
  file_name = file
571
  file = PdfReader(file)
572
  pdfNumberPages = len(file.pages)
 
660
  tabLine.append([file_name, selectedText, key])
661
  print(f"Selected line in keywords is: {line}")
662
 
663
+ except Exception as ex:
664
+ print(f"Error occured while extracting content : {ex}")
665
+
666
  for r in tabLine:
667
  text_joined = ''.join(r[1])
668
  text_joined = r[2] + " : \n " + text_joined
 
688
 
689
  return "dataframe_keywords.xlsx"
690
 
691
+ # -------------------------------------------------------------------------------- NON INTELLIGENT SPLIT
692
+
693
+ def transform_to_pdf(doc):
694
+ instructions = {'parts': [{'file': 'document'}]}
695
+
696
+ response = requests.request(
697
+ 'POST',
698
+ 'https://api.pspdfkit.com/build',
699
+ headers = { 'Authorization': 'Bearer pdf_live_nS6tyylSW57PNw9TIEKKL3Tt16NmLCazlQWQ9D33t0Q'},
700
+ files = {'document': open(doc, 'rb')},
701
+ data = {'instructions': json.dumps(instructions)},
702
+ stream = True
703
+ )
704
+
705
+ pdf_name = doc[:doc.find(".doc")] + ".pdf"
706
+
707
+ if response.ok:
708
+ with open(pdf_name, 'wb') as fd:
709
+ for chunk in response.iter_content(chunk_size=8096):
710
+ fd.write(chunk)
711
+ return pdf_name
712
+
713
+ else:
714
+ print(response.text)
715
+ exit()
716
+ return none
717
+
718
+
719
def non_intelligent_split(files, chunk_size=1000):
    """Split files into fixed-size text chunks and export them to Excel.

    Accepted inputs are PDFs (kept as-is), zip archives (extracted; contained
    PDFs kept, .doc/.docx converted), and .doc/.docx files (converted to PDF).
    Each PDF's text is then cut into chunks of roughly ``chunk_size``
    characters and written to ``dataframe_keywords.xlsx``.

    Parameters
    ----------
    files : list[str]
        Paths of the files to process.
    chunk_size : int, optional
        Approximate number of characters per chunk (default 1000).

    Returns
    -------
    str
        Name of the written Excel file, ``"dataframe_keywords.xlsx"``.
    """
    extracted_content = []
    processed_files = []

    # Normalize inputs: keep PDFs, extract zips, convert .doc/.docx to PDF.
    try:
        for f in files:
            # Bug fix: the duplicate flag must be reset for EVERY file.  The
            # original initialized it once before the loop, so the first
            # duplicate found caused all subsequent files to be skipped.
            not_duplicate = True
            for p in processed_files:
                # Same basename (extension stripped) => already processed.
                if f[:f.rfind('.')] == p[:p.rfind('.')]:
                    not_duplicate = False
            if not_duplicate:
                if f.endswith('.zip'):
                    extracted_files = extract_zip(f)
                    print(f"Those are my extracted files{extracted_files}")

                    for doc in extracted_files:
                        if doc.endswith('.doc') or doc.endswith('.docx'):
                            processed_files.append(transform_to_pdf(doc))

                        if doc.endswith('.pdf'):
                            processed_files.append(doc)

                if f.endswith('.pdf'):
                    processed_files.append(f)

                if f.endswith('.doc') or f.endswith('.docx'):
                    processed_files.append(transform_to_pdf(f))

    except Exception as ex:
        print(f"Error occurred while processing files : {ex}")

    # Extract text from each processed PDF in ~chunk_size character chunks.
    try:
        for f in processed_files:
            print(f"my filename is : {f}")
            file = PdfReader(f)
            pdfNumberPages = len(file.pages)
            selectedText = ""

            for pdfPage in range(0, pdfNumberPages):
                load_page = file.get_page(pdfPage)
                text = load_page.extract_text()
                lines = text.split("\n")
                sizeOfLines = 0

                for index, line in enumerate(lines):
                    sizeOfLines += len(line)
                    selectedText += " " + line
                    # Emit a chunk once the accumulated text reaches the limit.
                    if sizeOfLines >= chunk_size:
                        textContent = (f"Page {str(pdfPage)} : {selectedText}")
                        extracted_content.append([f, textContent])
                        sizeOfLines = 0
                        selectedText = ""

            # Flush whatever text remains after the last page.
            textContent = (f"Page {str(pdfNumberPages)} : {selectedText}")
            extracted_content.append([f, textContent])
    except Exception as ex:
        print(f"Error occurred while extracting content from processed files : {ex}")

    # Build one row per chunk: filename (basename only) + chunk text.
    df = pd.DataFrame()
    for content in extracted_content:
        filename = content[0]
        text = content[1]

        doc_data = {'Filename': filename[filename.rfind("/")+1:], 'Content': text}

        df = pd.concat([df, pd.DataFrame([doc_data])], ignore_index=True)

    df.to_excel("dataframe_keywords.xlsx", index=False)

    return "dataframe_keywords.xlsx"